calculate number of rows in a dataframe above threshold as a function or other column factors - r

I would like to get find the number for rows for each subject for each day where the value is greater than 11 and output these in a data frame for analysis. The data set is large (5000 rows) so need a function for this.
subject = c(rep("A", 12), rep("B", 12))
day = c(1,1,1,1,2,2,2,2,3,3,3,3,1,1,1,1,2,2,2,2,3,3,3,3)
value = c(13,14,15,5,12,9,6,14,4,2,1,2,13,14,15,5,12,9,6,14,2,2,2,3)
df = data.frame(subject, day, value)
df
subject day value
1 A 1 13
2 A 1 14
3 A 1 15
4 A 1 5
5 A 2 12
6 A 2 9
7 A 2 6
8 A 2 14
9 A 3 4
10 A 3 2
11 A 3 1
12 A 3 2
13 B 1 13
14 B 1 14
15 B 1 15
16 B 1 5
17 B 2 12
18 B 2 9
19 B 2 6
20 B 2 14
21 B 3 2
22 B 3 2
23 B 3 2
24 B 3 3
The output I would like would be
subject.agg = c(rep("A", 3), rep("B", 3))
day.agg = as.factor(c(1,2,3,1,2,3))
highvalues = (c(3,2,0,3,2,0))
df.agg = data.frame(subject.agg,day.agg,highvalues)
df.agg
subject.agg day.agg highvalues
1 A 1 3
2 A 2 2
3 A 3 0
4 B 1 3
5 B 2 2
6 B 3 0

One option is aggregate from base R
aggregate(cbind(highvalues=value>11)~., df, sum)
Or with data.table
library(data.table)
setDT(df)[value>11, .(highvalues=.N), by = .(subject, day)]
# subject day highvalues
#1: A 1 3
#2: A 2 2
#3: A 3 3
#4: B 1 3
#5: B 2 2
#6: B 3 3

You can go the tidyverse way:
df %>%
filter(value > 11) %>%
group_by(subject,day) %>%
mutate(highvalue = n()) %>%
select(subject, day, highvalue) %>%
unique()

library(data.table)
dt = setDT(df)
dt[, sum(value>11),by = .(subject,day)]
subject day V1
1: A 1 3
2: A 2 2
3: A 3 3
4: B 1 3
5: B 2 2
6: B 3 3

Related

Replace all subsequent column values, after the first instance of a value greater than x

I have a dataframe (df1) with two columns, one (grp) is a grouping variable, the second (num) has some measurements.
For each group I want to:
replace all numbers greater than 3.5 with 4
replace all numbers after the first instance of 4 with 4
I just want to get to step 2, but step 1 seems like a logical starting point, maybe it isn't required though?
Example data
library(dplyr)
df1 <- data.frame(
grp = rep(c("a", "b"), each = 10),
num = c(0,1,2,5,0,1,7,0,2,1,2,2,2,2,5,0,0,0,0,6))
I can get the first part:
df1 %>%
group_by(grp) %>%
mutate(num = ifelse(num > 3.5, 4, num))
For the second part I tried using dplyr::lag and dplyr::case_when but no luck. Here is the desired output:
grp num
1 a 0
2 a 1
3 a 2
4 a 4
5 a 4
6 a 4
7 a 4
8 a 4
9 a 4
10 a 4
11 b 2
12 b 2
13 b 2
14 b 2
15 b 4
16 b 4
17 b 4
18 b 4
19 b 4
20 b 4
Any advice would be much appreciated.
You could use cumany() to find all cases after the first event, i.e. num > 3.5.
library(dplyr)
df1 %>%
group_by(grp) %>%
mutate(num2 = replace(num, cumany(num > 3.5), 4)) %>%
ungroup()
# A tibble: 20 × 3
grp num num2
<chr> <dbl> <dbl>
1 a 0 0
2 a 1 1
3 a 2 2
4 a 5 4
5 a 0 4
6 a 1 4
7 a 7 4
8 a 0 4
9 a 2 4
10 a 1 4
11 b 2 2
12 b 2 2
13 b 2 2
14 b 2 2
15 b 5 4
16 b 0 4
17 b 0 4
18 b 0 4
19 b 0 4
20 b 6 4
You can also replace cumany(num > 3.5) with cumsum(num > 3.5) > 0.

Keep rows with specific string and the following row

This is my data frame
df <- data.frame(
id = 1:14,
group_id = c(rep(1:2, each = 3), rep(3:4, each = 4)),
type = rep("A", 14), stringsAsFactors = FALSE)
df[c(2,4,8,12),"type"] <- "B"
id group_id type
1 1 1 A
2 2 1 B
3 3 1 A
4 4 2 B
5 5 2 A
6 6 2 A
7 7 3 A
8 8 3 B
9 9 3 A
10 10 3 A
11 11 4 A
12 12 4 B
13 13 4 A
14 14 4 A
I'd like to keep all rows with type B as well as the following row.
I could do...
B <- which(df$type=="B")
afterB <- B+1
df_sel <- df[c(B, afterB), ]
df_sel <- df_sel[order(df_sel$id),]
df_sel
...to get what I want.
id group_id type
2 2 1 B
3 3 1 A
4 4 2 B
5 5 2 A
8 8 3 B
9 9 3 A
12 12 4 B
13 13 4 A
How can this be done in a more generic way.
Another way, very similar to what you do but in one step and without the need to reorder:
df_sel <- df[rep(which(df$type=="B"), e=2)+c(0, 1), ]
df_sel
# id group_id type
# 2 2 1 B
# 3 3 1 A
# 4 4 2 B
# 5 5 2 A
# 8 8 3 B
# 9 9 3 A
# 12 12 4 B
# 13 13 4 A
Using lag from dplyr
library(dplyr)
df[df$type == "B" | lag(df$type == "B", default = FALSE), ]
# id group_id type
#2 2 1 B
#3 3 1 A
#4 4 2 B
#5 5 2 A
#8 8 3 B
#9 9 3 A
#12 12 4 B
#13 13 4 A
using grep will provide a row index of all instances of B - rows; concatenate (c()) this with rows + 1 to select from df will work.
rows <- grep("B", df[, "type"])
df[sort(c(rows, rows + 1)), ]
gives:
id group_id type
2 2 1 B
3 3 1 A
4 4 2 B
5 5 2 A
8 8 3 B
9 9 3 A
12 12 4 B
13 13 4 A

Extract Index of repeat value

how do I extract specific row of data when the column has repetitive value? my data looks like this: I want to extract the row of the end of each repeat of x (A 3 10, A 2 3 etc) or the index of the last value
Name X M
A 1 1
A 2 9
A 3 10
A 1 1
A 2 3
A 1 5
A 2 6
A 3 4
A 4 5
A 5 3
B 1 1
B 2 9
B 3 10
B 1 1
B 2 3
Expected output
Index Name X M
3 A 3 10
5 A 2 3
10 A 5 3
13 B 3 10
15 B 2 3
Using base R duplicated and cumsum:
dups <- !duplicated(cumsum(dat$X == 1), fromLast=TRUE)
cbind(dat[dups,], Index=which(dups))
# Name X M Index
#3 A 3 10 3
#5 A 2 3 5
#10 A 5 3 10
#13 B 3 10 13
#15 B 2 3 15
A solution using dplyr.
library(dplyr)
df2 <- df %>%
mutate(Flag = ifelse(lead(X) < X, 1, 0)) %>%
mutate(Index = 1:n()) %>%
filter(Flag == 1 | is.na(Flag)) %>%
select(Index, X, M)
df2
# Index X M
# 1 3 3 10
# 2 5 2 3
# 3 10 5 3
# 4 13 3 10
# 5 15 2 3
Flag is a column showing if the next number in A is smaller than the previous number. If TRUE, Flag is 1, otherwise is 0. We can then filter for Flag == 1 or where Flag is NA, which is the last row. df2 is the final filtered data frame.
DATA
df <- read.table(text = "Name X M
A 1 1
A 2 9
A 3 10
A 1 1
A 2 3
A 1 5
A 2 6
A 3 4
A 4 5
A 5 3
B 1 1
B 2 9
B 3 10
B 1 1
B 2 3",
header = TRUE, stringsAsFactors = FALSE)

Get all combinations of a variable and their corresponding values in a grouped data set

My data looks like this:
mydata <- data.frame(id = c(1,1,1,2,2,3,3,3,3),
subid = c(1,2,3,1,2,1,2,3,4),
time = c(16, 18, 20, 10, 11, 7, 9, 10, 11))
id subid time
1 1 1 16
2 1 2 18
3 1 3 20
4 2 1 10
5 2 2 11
6 3 1 7
7 3 2 9
8 3 3 10
9 3 4 11
My goal is to transform the data to:
newdata <- data.frame(id = c(1,1,1,2,3,3,3,3,3,3),
subid.1 = c(1,1,2,1,1,1,1,2,2,3),
subid.2 = c(2,3,3,2,2,3,4,3,4,4),
time.1 = c(16,16,18,10,7,7,7,9,9,10),
time.2 = c(18,20,20,11,9,10,11,10,11,11))
id subid.1 subid.2 time.1 time.2
1 1 1 2 16 18
2 1 1 3 16 20
3 1 2 3 18 20
4 2 1 2 10 11
5 3 1 2 7 9
6 3 1 3 7 10
7 3 1 4 7 11
8 3 2 3 9 10
9 3 2 4 9 11
10 3 3 4 10 11
So it's not a simple reshape from long-to-wide procedure: The idea is, within groups defined by id, to take all possible combinations of
subid's and their corresponding time values, and get those into a wide format.
I know I can get all possible combinations using, for example gtools::combinations. The first group consists of 3 rows, so
gtools::combinations(n=3, r=2)
gives me the matrix of the new subid.1 and subid.2 pair for group id==1:
[,1] [,2]
[1,] 1 2
[2,] 1 3
[3,] 2 3
But then I don't know how to proceed (neither to reshape the group with id==1 to this format, nor how to do that separately for each group). Thank you!
with base R:
subset(merge(mydata, mydata, by="id", suffix=c(".1",".2")), subid.1 < subid.2)
# id subid.1 time.1 subid.2 time.2
# 1 1 1 16 2 18
# 2 1 1 16 3 20
# 3 1 2 18 3 20
# 4 2 1 10 2 11
# 5 3 1 7 2 9
# 6 3 1 7 3 10
# 7 3 1 7 4 11
# 8 3 2 9 3 10
# 9 3 2 9 4 11
# 10 3 3 10 4 11
dplyr version:
mydata %>% inner_join(.,.,by="id",suffix=c(".1",".2")) %>% filter(subid.1 < subid.2)
data.table version :
setDT(mydata)
mydata[mydata, on="id", allow.cartesian=TRUE][subid < i.subid]
# id subid time i.subid i.time
# 1: 1 1 16 2 18
# 2: 1 1 16 3 20
# 3: 1 2 18 3 20
# 4: 2 1 10 2 11
# 5: 3 1 7 2 9
# 6: 3 1 7 3 10
# 7: 3 2 9 3 10
# 8: 3 1 7 4 11
# 9: 3 2 9 4 11
# 10: 3 3 10 4 11
or to get your column names right, but it kills the fun of a short solution :).
merge(mydata, mydata, by="id", suffix=c(".1",".2"), allow.cartesian=TRUE)[subid.1 < subid.2]
Forgot to state that I came up with this rather lame 4-step solution:
step1 <- lapply(unique(mydata$id), function(x) {
nrows <- nrow(mydata[which(mydata$id == x), ])
combos <- gtools::combinations(n=nrows, r=2)
return(as.data.frame(cbind(x, combos)))
})
step2 <- dplyr::bind_rows(step1)
step3a <- merge(step2, mydata, by.x = c("x", "V2"), by.y = c("id", "subid"))
step3b <- merge(step3a, mydata, by.x = c("x", "V3"), by.y = c("id", "subid"))
step4 <- step3b[, c(1, 3, 2, 4, 5)]
names(step4) <- c("id", "subid.1", "subid.2", "time.1", "time.2")
It's ugly but works.
Using the data.table-package:
library(data.table)
setDT(mydata)[, .(subid = c(t(combn(subid, 2)))), by = id
][, grp := rep(1:2, each = .N/2), by = id
][mydata, on = .(id, subid), time := time
][, dcast(.SD, id + rowid(grp) ~ grp, value.var = list('subid','time'), sep = '.')]
which gives you:
id grp subid.1 subid.2 time.1 time.2
1: 1 1 1 2 16 18
2: 1 2 1 3 16 20
3: 1 3 2 3 18 20
4: 2 4 1 2 10 11
5: 3 5 1 2 7 9
6: 3 6 1 3 7 10
7: 3 7 1 4 7 11
8: 3 8 2 3 9 10
9: 3 9 2 4 9 11
10: 3 10 3 4 10 11

How to find difference between values in two rows in an R dataframe using dplyr

I have an R dataframe such as:
df <- data.frame(period=rep(1:4,2),
farm=c(rep('A',4),rep('B',4)),
cumVol=c(1,5,15,31,10,12,16,24),
other = 1:8);
period farm cumVol other
1 1 A 1 1
2 2 A 5 2
3 3 A 15 3
4 4 A 31 4
5 1 B 10 5
6 2 B 12 6
7 3 B 16 7
8 4 B 24 8
How do I find the change in cumVol at each farm in each period, ignoring the 'other' column? I would like a dataframe like this (optionally with the cumVol column remaining):
period farm volume other
1 1 A 0 1
2 2 A 4 2
3 3 A 10 3
4 4 A 16 4
5 1 B 0 5
6 2 B 2 6
7 3 B 4 7
8 4 B 8 8
In practice there may be many 'farm'-like columns, and many 'other'-like (ie. ignored) columns. I'd like to be able to specify all the column names using variables.
I am using the dplyr package.
In dplyr:
require(dplyr)
df %>%
group_by(farm) %>%
mutate(volume = cumVol - lag(cumVol, default = cumVol[1]))
Source: local data frame [8 x 5]
Groups: farm
period farm cumVol other volume
1 1 A 1 1 0
2 2 A 5 2 4
3 3 A 15 3 10
4 4 A 31 4 16
5 1 B 10 5 0
6 2 B 12 6 2
7 3 B 16 7 4
8 4 B 24 8 8
Perhaps the desired output should actually be as follows?
df %>%
group_by(farm) %>%
mutate(volume = cumVol - lag(cumVol, default = 0))
period farm cumVol other volume
1 1 A 1 1 1
2 2 A 5 2 4
3 3 A 15 3 10
4 4 A 31 4 16
5 1 B 10 5 10
6 2 B 12 6 2
7 3 B 16 7 4
8 4 B 24 8 8
Edit: Following up on your comments I think you are looking for arrange(). It that is not the case it might be best to start a new question.
df1 <- data.frame(period=rep(1:4,4), farm=rep(c(rep('A',4),rep('B',4)),2), crop=(c(rep('apple',8), rep('pear',8))), cumCropVol=c(1,5,15,31,10,12,16,24,11,15,25,31,20,22,26,34), other = rep(1:8,2) );
df1 %>%
arrange(desc(period), desc(farm)) %>%
group_by(period, farm) %>%
summarise(cumVol=sum(cumCropVol))
Edit: Follow up #2
df1 <- data.frame(period=rep(1:4,4), farm=rep(c(rep('A',4),rep('B',4)),2), crop=(c(rep('apple',8), rep('pear',8))), cumCropVol=c(1,5,15,31,10,12,16,24,11,15,25,31,20,22,26,34), other = rep(1:8,2) );
df <- df1 %>%
arrange(desc(period), desc(farm)) %>%
group_by(period, farm) %>%
summarise(cumVol=sum(cumCropVol))
ungroup(df) %>%
arrange(farm) %>%
group_by(farm) %>%
mutate(volume = cumVol - lag(cumVol, default = 0))
Source: local data frame [8 x 4]
Groups: farm
period farm cumVol volume
1 1 A 12 12
2 2 A 20 8
3 3 A 40 20
4 4 A 62 22
5 1 B 30 30
6 2 B 34 4
7 3 B 42 8
8 4 B 58 16
In dplyr -- so you don't have to replace NAs
library(dplyr)
df %>%
group_by(farm)%>%
mutate(volume = c(0,diff(cumVol)))
period farm cumVol other volume
1 1 A 1 1 0
2 2 A 5 2 4
3 3 A 15 3 10
4 4 A 31 4 16
5 1 B 10 5 0
6 2 B 12 6 2
7 3 B 16 7 4
8 4 B 24 8 8
Would creating a new column in your original dataset be an option?
Here is an option using the data.table operator :=.
require("data.table")
DT <- data.table(df)
DT[, volume := c(0,diff(cumVol)), by="farm"]
or
diff_2 <- function(x) c(0,diff(x))
DT[, volume := diff_2(cumVol), by="farm"]
Output:
# > DT
# period farm cumVol other volume
# 1: 1 A 1 1 0
# 2: 2 A 5 2 4
# 3: 3 A 15 3 10
# 4: 4 A 31 4 16
# 5: 1 B 10 5 0
# 6: 2 B 12 6 2
# 7: 3 B 16 7 4
# 8: 4 B 24 8 8
tapply and transform?
> transform(df, volumen=unlist(tapply(cumVol, farm, function(x) c(0, diff(x)))))
period farm cumVol other volumen
A1 1 A 1 1 0
A2 2 A 5 2 4
A3 3 A 15 3 10
A4 4 A 31 4 16
B1 1 B 10 5 0
B2 2 B 12 6 2
B3 3 B 16 7 4
B4 4 B 24 8 8
ave is a better option, see # thelatemail's comment
with(df, ave(cumVol,farm,FUN=function(x) c(0,diff(x))) )

Resources