I have a large dataset with longitudinal readings from single individuals.
I want to summarise the information over time into a binary variable, i.e. if diff in the input table below is > 5 for any row of an individual, I want to reduce that individual's observations to a single row with a new column set to TRUE.
#Input
individual val1 val2 diff
A 32 36 -4
A 36 28 8
A 28 26 2
A 26 26 0
B 65 64 1
B 58 59 -1
B 57 54 3
B 54 51 3
#Output
individual newval
A TRUE
B FALSE
Using dplyr you can:
library(dplyr)
df %>%
  group_by(individual) %>%             # first group data
  summarize(newval = any(diff > 5))    # then evaluate test for each group
#> # A tibble: 2 x 2
#> individual newval
#> <fct> <lgl>
#> 1 A TRUE
#> 2 B FALSE
data
df <- read.table(text = "individual val1 val2 diff
A 32 36 -4
A 36 28 8
A 28 26 2
A 26 26 0
B 65 64 1
B 58 59 -1
B 57 54 3
B 54 51 3
", header = TRUE)
Multiple ways to do this:
In base R we can use aggregate
aggregate(diff ~ individual, df, function(x) any(x > 5))
# individual diff
#1 A TRUE
#2 B FALSE
Or tapply
tapply(df$diff > 5, df$individual, any)
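tapply returns a named logical vector; if you want it in the two-column shape of the desired output, a minimal base R sketch (res is just a temporary name; df as in the data block above) is:
res <- tapply(df$diff > 5, df$individual, any)
data.frame(individual = names(res), newval = as.vector(res))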
We can also use data.table
library(data.table)
setDT(df)[, .(newval = any(diff > 5)), by = individual]
An option in base R with rowsum
rowsum(+(df1$diff > 5), df1$individual) != 0
or with by
by(df1$diff > 5, df1$individual, any)
data
df1 <- structure(list(individual = c("A", "A", "A", "A", "B", "B", "B",
"B"), val1 = c(32L, 36L, 28L, 26L, 65L, 58L, 57L, 54L), val2 = c(36L,
28L, 26L, 26L, 64L, 59L, 54L, 51L), diff = c(-4L, 8L, 2L, 0L,
1L, -1L, 3L, 3L)), class = "data.frame", row.names = c(NA, -8L
))
How can I get the index of the sample whose previous samples were consecutive in a window or range and were greater than a fixed threshold in groups?
In the example below, for each group I need to find the time of the sample that immediately follows 3 consecutive samples with Speed >= 35, considering only the window after the first 3 elements of each group.
speed_threshold = 35
Group Time Speed
1 5 25 # Ignore first 3 elements
1 10 23 # Ignore first 3 elements
1 15 21 # Ignore first 3 elements
1 20 33 # Speed < 35
1 25 40 # Speed > 35
1 30 42 # Speed > 35
1 35 52 # Speed > 35
1 40 48 # <--- Return time = 40 as answer for Group 1 !
1 45 52
2 5 48 # Ignore first 3 elements
2 10 42 # Ignore first 3 elements
2 15 39 # Ignore first 3 elements
2 20 36 # Speed > 35
2 25 38 # Speed > 35
2 30 46 # Speed > 35
2 35 53 # <--- Return time = 35 as answer for Group 2 !
3 5 45 # Ignore first 3 elements
3 10 58 # <--- Return time = NA as answer for group 3 !
The solution I have tried, using data.table, is as follows:
df[, {above <- Speed[-(1:3)] > speed_thresh
ends <- which(above & rowid(rleid(above)) == 3)
.(Return_Time = Time[ends[1]+ 1])}
, Group]
The above solution removes the first three elements from the entire array rather than from each group. How can I ignore the first three elements in each group and then find the consecutive values exceeding the threshold?
Thanks in advance!
Note
Lines <- "Group Time Speed
1 5 25 # Ignore first 3 elements
1 10 23 # Ignore first 3 elements
1 15 21 # Ignore first 3 elements
1 20 33 # Speed < 35
1 25 40 # Speed > 35
1 30 42 # Speed > 35
1 35 52 # Speed > 35
1 40 48 # <--- Return time = 40 as answer for Group 1 !
1 45 52
2 5 48 # Ignore first 3 elements
2 10 42 # Ignore first 3 elements
2 15 39 # Ignore first 3 elements
2 20 36 # Speed > 35
2 25 38 # Speed > 35
2 30 46 # Speed > 35
2 35 53 # <--- Return time = 35 as answer for Group 2 !
3 5 45 # Ignore first 3 elements
3 10 58 # <--- Return time = NA as answer for group 3 !"
df <- read.table(text = Lines, header = TRUE)
Here is a tidyverse solution...
library(dplyr)
speed_threshold <- 35
df %>% group_by(Group) %>%
  mutate(ind = cumsum(Speed >= speed_threshold),  # number exceeding threshold
         ind = ind - lag(ind, 3, default = 0),    # compared to 3 previous
         ind = lag(ind, default = 0) == 3) %>%    # mark one after where this hits 3
  summarise(Return_Time = Time[max(7, which(ind)[1])])  # has to be at least the 7th value
# A tibble: 3 x 2
Group Return_Time
<int> <int>
1 1 40
2 2 35
3 3 NA
library(data.table)
setDT(df)
speed_thresh <- 35
df[, {
window <- 4:.N
above <- Speed[window] >= speed_thresh
ends <- which(above & rowid(rleid(above)) == 3)
.(Return_Time = Time[window][ends[1] + 1])
}
, Group]
#> Group Return_Time
#> 1: 1 40
#> 2: 2 35
#> 3: 3 NA
Let the ith value of roll be TRUE if the last 3 of 6 values ending at index i all exceed 35. Then find the first TRUE in each group, add 1 and index that into Time.
library(data.table)
library(zoo)
roll <- function(x) rollapplyr(x, 6, function(x) all(tail(x, 3) > 35), fill = FALSE)
DT[, list(Time = Time[which(roll(Speed))[1] + 1]), by = Group]
giving
Group Time
1: 1 40
2: 2 35
3: 3 NA
Note
DF <- structure(list(Group = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L), Time = c(5L, 10L, 15L, 20L,
25L, 30L, 35L, 40L, 45L, 5L, 10L, 15L, 20L, 25L, 30L, 35L, 5L,
10L), Speed = c(25L, 23L, 21L, 33L, 40L, 42L, 52L, 48L, 52L,
48L, 42L, 39L, 36L, 38L, 46L, 53L, 45L, 58L)), row.names = c(NA,
-18L), class = "data.frame")
library(data.table)
DT <- as.data.table(DF)
Note 2
The poster indicated that they have outdated versions of R and of all packages and are restricted from installing packages, so use this base R version of roll instead.
roll <- function(x) {
f <- function(i) if (i < 6) FALSE else all(x[seq(to = i, length = 3)] > 35)
sapply(seq_along(x), f)
}
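If data.table itself is also unavailable, the same roll logic can be applied group-wise in plain base R; a minimal sketch, assuming DF as defined in the Note above:
# split by Group, find the first TRUE of roll() in each group,
# and take the Time of the following row (NA if there is none)
sapply(split(DF, DF$Group), function(d) d$Time[which(roll(d$Speed))[1] + 1])
#>  1  2  3
#> 40 35 NA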
I've got the dataframe below and am trying to compute whether there is a significant difference in the proportions between the groups within each category, e.g. for category A, group 1 versus 2, 1 versus 3, and 2 versus 3.
Is there a way to calculate and add the p-values to the dataframe as new columns without having to calculate it and add it manually one row at a time?
Or is there a way to calculate them and store them in a separate data frame?
Group Category number min total Proportion
1 1 A 6 2.5 33 0.1818182
2 1 B 4 3.2 33 0.1212121
3 1 C 16 3.2 33 0.4848485
4 1 D 7 3.1 33 0.2121212
5 2 A 22 6.4 133 0.1654135
6 2 B 17 6.7 133 0.1278195
7 2 C 56 6.0 133 0.4210526
8 2 D 38 6.4 133 0.2857143
9 3 A 3 10.0 22 0.1363636
10 3 B 3 9.7 22 0.1363636
11 3 C 9 10.6 22 0.4090909
12 3 D 7 9.9 22 0.3181818
The task looks easy but the solution is somewhat involved. Here is one using the purrr package as the core tool.
Let's import data:
data <- structure(list(Group = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 3L), Category = c("B", "C", "D", "A", "B", "C", "D", "A",
"B", "C", "D"), number = c(4L, 16L, 7L, 22L, 17L, 56L, 38L, 3L,
3L, 9L, 7L), min = c(3.2, 3.2, 3.1, 6.4, 6.7, 6, 6.4, 10, 9.7,
10.6, 9.9), total = c(33L, 33L, 33L, 133L, 133L, 133L, 133L,
22L, 22L, 22L, 22L), Proportion = c(0.1212121, 0.4848485, 0.2121212,
0.1654135, 0.1278195, 0.4210526, 0.2857143, 0.1363636, 0.1363636,
0.4090909, 0.3181818)), row.names = 2:12, class = "data.frame")
and required packages:
library(dplyr) # mutate, group_by and rowwise functions
library(tidyr) # nest
library(purrr) # map
library(combinat) # combn
We will create a tibble foo which nests the original dataset by group. That allows us to map a function over the groups.
foo <- data %>% group_by(Group) %>% nest()
Now we define our own function combPval, which 1) creates a data.frame of all pairwise combinations of categories (combTab) and 2) creates a data.frame tab1 that stores the columns relevant for prop.test. These data.frames are merged in subsequent steps to create the data.frame data, and prop.test is then applied row by row.
combPval <- function(group){
  # all pairwise combinations of the categories present in the group
  combTab <- combn(unique(group$Category), 2) %>% t() %>% data.frame()
  # columns needed by prop.test
  tab1 <- group %>% select(Category, number, total)
  # attach counts and totals for both members of each pair
  temp <- merge(y = combTab, x = tab1, by.y = "X2", by.x = "Category")
  data <- merge(y = temp, x = tab1, by.y = "X1", by.x = "Category")
  data <- data %>%
    rowwise() %>%
    mutate(
      pval = prop.test(x = c(number.x, number.y), n = c(total.x, total.y))$p.val
    )
  data
}
Function combPval is applied in the following way:
foo <- foo %>% mutate(results = map(data, combPval))
Results for the first group can be obtained:
foo$results[[1]]
# A tibble: 3 x 7
# Rowwise:
Category number.x total.x Category.y number.y total.y pval
<chr> <int> <int> <chr> <int> <int> <dbl>
1 B 4 33 C 16 33 0.00322
2 B 4 33 D 7 33 0.509
3 C 16 33 D 7 33 0.0388
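If you would rather have all the p-values in one separate data frame (the second option asked about in the question), one possibility is to unnest the results column; a minimal sketch, assuming tidyr >= 1.0:
foo %>%
  select(Group, results) %>%  # keep the group key and the nested results
  unnest(results)             # one row per category pair per group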
I have an example dataset:
Road Start End Cat
1 0 50 a
1 50 60 b
1 60 90 b
1 70 75 a
2 0 20 a
2 20 25 a
2 25 40 b
I am trying to output the following:
Road Start End Cat
1 0 50 a
1 50 90 b
1 70 75 a
2 0 25 a
2 25 40 b
My code doesn't work:
df %>% group_by(Road, cat)
%>% summarise(
min(Start),
max(End)
)
How can I achieve the results I want?
We can use rleid from data.table to get a run-length id that groups consecutive rows with the same Cat, and then do the summarise:
library(dplyr)
library(data.table)
df %>%
group_by(Road, grp = rleid(Cat)) %>%
summarise(Cat = first(Cat), Start = min(Start), End = max(End)) %>%
select(-grp)
# A tibble: 5 x 4
# Groups: Road [2]
# Road Cat Start End
# <int> <chr> <int> <int>
#1 1 a 0 50
#2 1 b 50 90
#3 1 a 70 75
#4 2 a 0 25
#5 2 b 25 40
Or using data.table methods
library(data.table)
setDT(df)[, .(Start = min(Start), End = max(End)), .(Road, Cat, grp = rleid(Cat))]
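Note that the data.table call above keeps the helper grp column in its result; to match the four desired columns you could drop it in a chained step, for example (a sketch):
setDT(df)[, .(Start = min(Start), End = max(End)),
          by = .(Road, Cat, grp = rleid(Cat))][, grp := NULL][]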
data
df <- structure(list(Road = c(1L, 1L, 1L, 1L, 2L, 2L, 2L), Start = c(0L,
50L, 60L, 70L, 0L, 20L, 25L), End = c(50L, 60L, 90L, 75L, 20L,
25L, 40L), Cat = c("a", "b", "b", "a", "a", "a", "b")),
class = "data.frame", row.names = c(NA,
-7L))
The pattern of my data is like this:
df1<-read.table(text="Car1 Car2 Car3 Time1 Time2 Time3
22 33 90 20 90 20
11 45 88 10 80 30
22 33 40 40 10 10
11 45 40 10 10 40
11 45 88 10 12 60
22 45 90 60 20 100",header=TRUE)
I want to calculate the mean and SD based on Car and Time. The point is that Car1 corresponds to Time1, Car2 corresponds to Time2, Car3 corresponds to Time3, and so on.
I want to get the following table:
Car1 Mean SD
11 10 0
22 40 20
Car2
33 xx xx
45 xx xx
Car3
40 xx xx
88 xx xx
90 xx xx
I have tried:
df1 %>% group_by(Car1,Car2,Car3) %>%
summarise(mean=mean(Time,SD=sd(Time))
Unfortunately, it does not work. Any help?
You can also use the package data.table:
library(data.table)
melt(setDT(df1),
measure = patterns("Car", "Time"),
value.name = c("Car", "Time"),
variable.name = "group"
)[, .(Mean = mean(Time), Sd = sd(Time)), .(group, Car)]
# group Car Mean Sd
# 1: 1 22 40.0 20.00000
# 2: 1 11 10.0 0.00000
# 3: 2 33 50.0 56.56854
# 4: 2 45 30.5 33.28163
# 5: 3 90 60.0 56.56854
# 6: 3 88 45.0 21.21320
# 7: 3 40 25.0 21.21320
Here is one option with pivot_longer, where we reshape from 'wide' to 'long' format, group by the 'group' index and 'Car', and get the mean and sd of 'Time' with summarise.
library(dplyr)
library(tidyr)
df1 %>%
pivot_longer(cols = everything(), names_to = c(".value", "group"),
names_sep="(?<=[a-z])(?=\\d+)") %>%
group_by(group, Car) %>%
summarise(Mean = mean(Time), SD = sd(Time))
# A tibble: 7 x 4
# Groups: group [3]
# group Car Mean SD
# <chr> <int> <dbl> <dbl>
#1 1 11 10 0
#2 1 22 40 20
#3 2 33 50 56.6
#4 2 45 30.5 33.3
#5 3 40 25 21.2
#6 3 88 45 21.2
#7 3 90 60 56.6
Assuming you can easily split your data into Time and Car columns, you can do this using a loop, given the data structure you provided.
cars <- df1[1:3]
Time <- df1[4:6]
ls <- list()
for(i in 1:ncol(cars)) {
ls[[i]] <- aggregate(Time[i], by = cars[i], FUN = function(x) c(mean(x), sd(x)))
}
ls
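aggregate with a FUN returning two values stores them in a single matrix column, so the mean and sd appear unnamed in the printed result. If you prefer separate, named columns, a variant of the same loop (a sketch) is:
ls <- list()
for (i in 1:ncol(cars)) {
  # return a named vector, then flatten the matrix column into real columns
  res <- aggregate(Time[i], by = cars[i], FUN = function(x) c(Mean = mean(x), SD = sd(x)))
  ls[[i]] <- do.call(data.frame, res)
}
ls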
Data for the results is:
df1 <- structure(list(Car1 = c(22L, 11L, 22L, 11L, 11L, 22L), Car2 = c(33L,
45L, 33L, 45L, 45L, 45L), Car3 = c(90L, 88L, 40L, 40L, 88L, 90L
), Time1 = c(20L, 10L, 40L, 10L, 10L, 60L), Time2 = c(90L, 80L,
10L, 10L, 12L, 20L), Time3 = c(20L, 30L, 10L, 40L, 60L, 100L)), class = "data.frame", row.names = c(NA,
-6L))
library(dplyr)
# split columns by their numeric suffix (Car1/Time1, Car2/Time2, ...),
# then summarise each Car/Time pair by its Car column
lapply(split.default(df1, gsub("\\D+", "", names(df1))), function(x){
  d = gsub("\\D+", "", names(x)[1])
  x %>%
    group_by(!!sym(paste0("Car", d))) %>%
    summarise(mean = mean(!!sym(paste0("Time", d))),
              sd = sd(!!sym(paste0("Time", d)))) %>%
    ungroup()
})
Input Data
ID value
a 10
a 12
a 18
a 13
b 23
b 25
b 33
c 17
c 23
c 27
The output data should look like:
ID value Diff
a 10 0
a 12 2
a 18 8
a 13 3
b 23 0
b 25 2
b 33 10
c 17 0
c 23 6
c 27 10
I got this code from the net:
library(data.table)
DT <- as.data.table(dat)
DT[, `:=`(DIFTIME, c(0, diff(as.Date(DATETIME)))), by = "ID"]
but this only creates the difference between two successive rows, not from the first instance of that group.
dat<-structure(list(ID = c(1L, 1L, 1L, 1L, 2L, 2L, 3L, 3L),
DATETIME = structure(c(1328346000,1328479200, 1331024400,1331025400, 1328086800, 1328184000, 1336287600, 1336424400),
class = c("POSIXct", "POSIXt"), tzone = ""),
VALUE = c(300L,150L, 650L, 450L, 855L, 240L, 340L, 240L)),
.Names = c("ID", "DATETIME","VALUE"), class = "data.frame", row.names = c(NA, 7L))
You could also use dplyr, where df is the original data:
library(dplyr)
group_by(df, ID) %>% mutate(Diff = value - first(value))
# ID value Diff
# 1 a 10 0
# 2 a 12 2
# 3 a 18 8
# 4 a 13 3
# 5 b 23 0
# 6 b 25 2
# 7 b 33 10
# 8 c 17 0
# 9 c 23 6
# 10 c 27 10
Using data.table:
setDT(df)[, Diff:=value-value[1], by=ID]
df
# ID value Diff
#1: a 10 0
#2: a 12 2
#3: a 18 8
#4: a 13 3
#5: b 23 0
#6: b 25 2
#7: b 33 10
#8: c 17 0
#9: c 23 6
#10: c 27 10
data
df <- structure(list(ID = c("a", "a", "a", "a", "b", "b", "b", "c",
"c", "c"), value = c(10L, 12L, 18L, 13L, 23L, 25L, 33L, 17L,
23L, 27L)), .Names = c("ID", "value"), class = "data.frame", row.names = c(NA,
-10L))
You can do it in base R using the ave function.
df$Diff <- ave(df$value, df$ID, FUN = function(x) x - x[1])