I currently have the following data format:
df = data.frame(c(rep("A", 12), rep("B", 12)), rep(1:12, 2), seq(-12, 11))
colnames(df) = c("station", "month", "mean")
df
df_master = data.frame(c(rep("A", 10), rep("B", 10)),
                       rep(c(27:31, 1:5), 2),
                       rep(c(rep(1, 5), rep(2, 5)), 2),
                       rep(seq(-4, 5), 2))
colnames(df_master) = c("station", "day", "month", "value")
df_master
Effectively df is a monthly average value for each station, and I want to compute a new variable in the df_master data set which gives the difference from the monthly mean for each daily observation. I have managed to do this with an overall average including all the data, but since the mean values vary between stations I would like to make the new variable station specific.
I have tried the following code to match the monthly value, but this currently doesn't account for cross-station differences:
df_master$mean = df$mean[match(df_master$month, df$month)]
df_master = df_master %>% mutate(diff = value - mean)
How can I progress this further so that the averages are taken per station?
With dplyr, using a left join:
library(dplyr)
left_join(df_master, df, by = c('station', 'month')) %>%
  mutate(monthdiff = value - mean) %>%
  select(-mean)
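Since the example data is fixed, the first rows of the result can be verified directly (a sketch):
#   station day month value monthdiff
# 1       A  27     1    -4         8
# 2       A  28     1    -3         9
# 3       A  29     1    -2        10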
If you convert them to data.tables, you can add the difference column with an update join, joining df_master with df on the condition that the values for both station and month are equal (the i. prefix in i.mean below refers to a column of df, the table passed as i).
library(data.table)
setDT(df_master)
setDT(df)
df_master[df, on = .(station, month),
          diff_monthmean := value - i.mean]
df_master
# station day month value diff_monthmean
# 1: A 27 1 -4 8
# 2: A 28 1 -3 9
# 3: A 29 1 -2 10
# 4: A 30 1 -1 11
# 5: A 31 1 0 12
# 6: A 1 2 1 12
# 7: A 2 2 2 13
# 8: A 3 2 3 14
# 9: A 4 2 4 15
# 10: A 5 2 5 16
# 11: B 27 1 -4 -4
# 12: B 28 1 -3 -3
# 13: B 29 1 -2 -2
# 14: B 30 1 -1 -1
# 15: B 31 1 0 0
# 16: B 1 2 1 0
# 17: B 2 2 2 1
# 18: B 3 2 3 2
# 19: B 4 2 4 3
# 20: B 5 2 5 4
Another option could be the following (note that merge() sorts its result by the join columns, so this alignment relies on df_master already being ordered by station and month):
transform(df_master,
          diff = value - merge(df_master, df, by = c('station', 'month'), all.x = TRUE)$mean)
Or, using match() with interaction():
transform(df_master,
          diff = value - df$mean[match(interaction(df_master[c("month", "station")]),
                                       interaction(df[c("month", "station")]))])
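For intuition, interaction() collapses the two key columns into one composite factor (one level per month/station pair), so a single match() compares both keys at once:
head(as.character(interaction(df_master[c("month", "station")])))
# [1] "1.A" "1.A" "1.A" "1.A" "1.A" "2.A"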
DATA = data.frame("GROUP" = sort(rep(1:4, 200)),
"TYPE" = rep(1:2, 400),
"TIME" = rep(100:101, 400),
"SCORE" = sample(1:100,r=T,800))
Cheers all,
I have 'DATA' and wish to estimate the CORRELATION VALUES of SCORE, BETWEEN AND WITHIN GROUP, for each TIME and TYPE combination.
I am assuming you want to compute the correlation between groups 1-2, 1-3, 1-4 and so on for each combination of TIME and TYPE. Here's an approach:
# create the dataset
set.seed(123)
df <- data.frame("group" = sort(rep(1:4, 200)),
"type" = rep(1:2, 400),
"time" = rep(100:101, 400),
"score" = sample(1:100,r=T,800))
library(dplyr)
library(tidyr)
library(purrr)
library(data.table)
# another dataset to filter combinations
# (G1G2 is the same as G2G1, so G2G1 will be removed)
df2 <- combn(4, 2) %>% t %>%
  as_tibble() %>%
  rename(group1 = V1, group2 = V2) %>%
  mutate(value = TRUE)
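Since combn(4, 2) is deterministic, df2 always holds the six unique group pairs:
df2
# # A tibble: 6 x 3
#   group1 group2 value
#    <int>  <int> <lgl>
# 1      1      2 TRUE
# 2      1      3 TRUE
# 3      1      4 TRUE
# 4      2      3 TRUE
# 5      2      4 TRUE
# 6      3      4 TRUE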
df %>%
# add identifiers per group
group_by(time, type, group) %>%
mutate(id = row_number()) %>%
ungroup() %>%
# nest data to get separate tibble for each
# combination of time and type
nest(data = -c(time, type)) %>%
# convert each data.frame to data.table
mutate(dt = map(data, function(dt){
setDT(dt)
setkey(dt, id)
dt
})) %>%
# correlation between groups in R
# refer answer below for more details
# https://stackoverflow.com/a/26357667/15221658
# cartesian join of dts
mutate(dtj = map(dt, ~.[., allow.cartesian = TRUE])) %>%
# compute between group correlation
mutate(cors = map(dtj, ~.[, list(cors = cor(score, i.score)), by = list(group, i.group)])) %>%
# unnest correlation object
unnest(cors) %>%
# formatting for display
select(type, time, group1 = group, group2 = i.group, correlation = cors) %>%
filter(group1 != group2) %>%
arrange(time, group1, group2) %>%
# now use df2 since currently we have G1G2, and G2G1
# which are both equal so remove G2G1
left_join(df2, by = c("group1", "group2")) %>%
filter(value) %>%
select(-value)
# A tibble: 12 x 5
type time group1 group2 correlation
<int> <int> <int> <int> <dbl>
1 1 100 1 2 0.121
2 1 100 1 3 0.0543
3 1 100 1 4 -0.0694
4 1 100 2 3 -0.104
5 1 100 2 4 -0.0479
6 1 100 3 4 -0.0365
7 2 101 1 2 -0.181
8 2 101 1 3 -0.0673
9 2 101 1 4 0.00765
10 2 101 2 3 0.0904
11 2 101 2 4 -0.0126
12 2 101 3 4 -0.154
Here is an alternative approach which creates all unique combinations of TIME, TYPE, and pairs of GROUP through a cross join, and then computes the correlation of SCORE for the corresponding subsets of DATA:
library(data.table) # development version 1.14.3 required
setDT(DATA, key = c("GROUP", "TYPE", "TIME"))[
  , CJ(time = TIME, type = TYPE, groupA = GROUP, groupB = GROUP, unique = TRUE)][
    groupA < groupB][
  , corType := paste0("G", groupA, "G", groupB)][
  , corValue := cor(DATA[.(groupA, type, time), SCORE],
                    DATA[.(groupB, type, time), SCORE]),
  by = .I][]
time type groupA groupB corType corValue
1: 100 1 1 2 G1G2 0.11523940
2: 100 1 1 3 G1G3 -0.05124326
3: 100 1 1 4 G1G4 -0.16943203
4: 100 1 2 3 G2G3 0.05475435
5: 100 1 2 4 G2G4 -0.10769738
6: 100 1 3 4 G3G4 0.01464146
7: 100 2 1 2 G1G2 NA
8: 100 2 1 3 G1G3 NA
9: 100 2 1 4 G1G4 NA
10: 100 2 2 3 G2G3 NA
11: 100 2 2 4 G2G4 NA
12: 100 2 3 4 G3G4 NA
13: 101 1 1 2 G1G2 NA
14: 101 1 1 3 G1G3 NA
15: 101 1 1 4 G1G4 NA
16: 101 1 2 3 G2G3 NA
17: 101 1 2 4 G2G4 NA
18: 101 1 3 4 G3G4 NA
19: 101 2 1 2 G1G2 -0.04997479
20: 101 2 1 3 G1G3 -0.02262932
21: 101 2 1 4 G1G4 -0.00331578
22: 101 2 2 3 G2G3 -0.01243952
23: 101 2 2 4 G2G4 0.16683223
24: 101 2 3 4 G3G4 -0.10556083
time type groupA groupB corType corValue
Explanation
DATA is coerced to class data.table while setting a key on columns GROUP, TYPE, and TIME. Keying is required for fast subsetting later.
The cross join CJ() creates all unique combinations of columns TIME, TYPE, GROUP, and GROUP (twice). The columns of the cross join have been renamed to avoid name clashes later on.
[groupA < groupB] ensures that equivalent combinations of groupA and groupB only appear once, e.g., G2G1 is dropped in favour of G1G2. So, this is kind of data.table version of t(combn(unique(DATA$GROUP), 2)).
A new column corType is append by reference.
Finally, the groupwise correlations are computed by stepping rowwise through the cross join table (using by = .I) and subsetting DATA by groupA, type, time and groupB, type, time, resp., using fast subsetting through keys. Please, see the vignette Keys and fast binary search based subset for more details.
Note that by = .I is a new feature of data.table development version 1.14.3.
Combinations of time, type, and group which do not exist in DATA will appear in the result set but are marked by NA in column corValue.
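For comparison, the base R expression mentioned above yields the same six group pairs:
t(combn(unique(DATA$GROUP), 2))
#      [,1] [,2]
# [1,]    1    2
# [2,]    1    3
# [3,]    1    4
# [4,]    2    3
# [5,]    2    4
# [6,]    3    4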
Data
set.seed(42) # required for reproducible data
DATA = data.frame("GROUP" = sort(rep(1:4, 200)),
"TYPE" = rep(1:2, 400),
"TIME" = rep(100:101, 400),
"SCORE" = sample(1:100, r=T, 800))
I have the following data frame in R:
df <- data.frame(name = c('p1_start','p1_end','p2_start','p2_end','p1_start','p1_end','p2_start','p2_end','p1_start','p1_end','p2_start','p2_end','p1_start','p1_end','p2_start','p2_end'),
time = c(1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31),
target = c(1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2),
comb = c(0,0,0,0,1,1,1,1,0,0,0,0,1,1,1,1))
And another data frame:
data <- data.frame(time = c(2,5,8,14,14,20,21,26,28,28),
name = c('a','b','c','d','e','f','g','h','i','j'))
So, if we take a look at df, sorted by target and comb, we notice that there are basically "groups": for example, for target=1 and comb=0 there are four entries p1_start, p1_end, p2_start, p2_end, and it is the same for all other target/comb combinations.
On the other side, data contains entries whose time is a timestamp.
Goal: I want to map the values from both data frames based on time.
Example: The first entry of data has time=2 meaning it happened between p1_start,p1_end so it should get the values target=1 and comb=0 mapped to the data data frame.
Example 2: The entries of data with time=14 happened between p2_start,p2_end so they should get the values target=1 and comb=1 mapped to the data data frame.
Idea: I thought I would iterate over df by target and comb and, for each combination of them, check whether there are rows in data whose time lies between the start and end times. The second part could be done with the following command:
data[which(data$time > p1_start & data$time < p2_end),]
once I get the rows it is easy to append the values.
Problem: how could I do the iteration? I tried with the following:
df %>%
group_by(target, comb) %>%
print(data[which(data$time > df$p1_start & data$time < df$p2_end),])
But I am getting an error that time has not been initialized
Your problem is best known as a non-equi join: we need to find, for each value in one or more given vectors, the range in a given dataframe that it falls into. This is handled well by the data.table package.
We would first transform your df into a format suitable for performing the join and then join data with df by time <= end while time >= start. Here is the code:
library(data.table)
setDT(df)[, c("type", "name") := tstrsplit(name, "_", fixed = TRUE)]
df <- dcast(df, ... ~ name, value.var = "time")
cols <- c("target", "comb", "type")
setDT(data)[df, (cols) := mget(paste0("i.", cols)), on = .(time<=end, time>=start)]
After dcast, df looks like this
target comb type end start
1: 1 0 p1 3 1
2: 1 0 p2 7 5
3: 1 1 p1 11 9
4: 1 1 p2 15 13
5: 2 0 p1 19 17
6: 2 0 p2 23 21
7: 2 1 p1 27 25
8: 2 1 p2 31 29
And the output is
> data
time name target comb type
1: 2 a 1 0 p1
2: 5 b 1 0 p2
3: 8 c NA NA <NA>
4: 14 d 1 1 p2
5: 14 e 1 1 p2
6: 20 f NA NA <NA>
7: 21 g 2 0 p2
8: 26 h 2 1 p1
9: 28 i NA NA <NA>
10: 28 j NA NA <NA>
Here is a tidyverse solution (note that it only matches times that equal an interval boundary or a boundary + 1, so times falling outside every interval, e.g. 8 and 20, still receive values rather than NA, unlike the non-equi join answer above):
library(tidyr)
library(dplyr)
df %>%
rename(name_df=name) %>%
mutate(x = time +1) %>%
pivot_longer(
cols = c(time, x),
names_to = "helper",
values_to = "time"
) %>%
right_join(data, by="time") %>%
select(time, name, target, comb)
time name target comb
<dbl> <chr> <dbl> <dbl>
1 2 a 1 0
2 5 b 1 0
3 8 c 1 0
4 14 d 1 1
5 14 e 1 1
6 20 f 2 0
7 21 g 2 0
8 26 h 2 1
9 28 i 2 1
10 28 j 2 1
df <- data.frame(name = c('p1_start','p1_end','p2_start','p2_end','p1_start','p1_end','p2_start','p2_end','p1_start','p1_end','p2_start','p2_end','p1_start','p1_end','p2_start','p2_end'),
time = c(1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31),
target = c(1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2),
comb = c(0,0,0,0,1,1,1,1,0,0,0,0,1,1,1,1))
data <- data.frame(time = c(2,5,8,14,14,20,21,26,28,28),
name = c('a','b','c','d','e','f','g','h','i','j'))
library(fuzzyjoin)
library(tidyverse)
tmp <- df %>%
separate(name,
into = c("p", "period"),
sep = "_",
remove = TRUE) %>%
pivot_wider(
id_cols = c(p, target, comb),
names_from = period,
values_from = time
) %>%
select(-p)
fuzzy_left_join(
x = data,
y = tmp,
by = c("time" = "start",
"time" = "end"),
match_fun = list(`>=`, `<=`))
#> time name target comb start end
#> 1 2 a 1 0 1 3
#> 2 5 b 1 0 5 7
#> 3 8 c NA NA NA NA
#> 4 14 d 1 1 13 15
#> 5 14 e 1 1 13 15
#> 6 20 f NA NA NA NA
#> 7 21 g 2 0 21 23
#> 8 26 h 2 1 25 27
#> 9 28 i NA NA NA NA
#> 10 28 j NA NA NA NA
Created on 2022-01-11 by the reprex package (v2.0.1)
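If the start and end columns are not wanted in the final result, a select() can drop them at the end of the pipeline (a sketch):
fuzzy_left_join(x = data, y = tmp,
                by = c("time" = "start", "time" = "end"),
                match_fun = list(`>=`, `<=`)) %>%
  select(-start, -end)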
I am trying to rearrange a dataset with a few thousand observations (to eventually use the drm function in package DRC), and I am tired of doing it in Excel. Within a dataframe I am looking to add "start" and "end" times (up to Inf) based on the intervals found in a vector within the df. This means I have to add an observation (row) in which the last "end" time is Inf. For that last row (the one with Inf) I ALSO need to subtract the total of "value" from an arbitrary number (in my example below this would be 50). All this grouped by two variables ("Name" and "Rep" in my example). I am hoping there is a solution using group_by, but honestly I'll be overjoyed at any solution!
I have a data set that looks like this:
# data
names<-c(rep("Luke",30), rep("Han", 30), rep("Leia", 30), rep("OB1", 30))
reps<-c(rep("A", 10), rep("B", 10), rep("C", 10))
time<-rep(seq(1:10), 4)
value<-rep(sample(0:5,10,replace=T), 4)
df<-data.frame(names, reps, time, value)
but need it to look like this:
[image: example of the data structure I need]
I'm at a loss. Please help!
If I have understood you correctly, we can do
library(dplyr)
df1 <- df %>%
group_by(names, reps) %>%
mutate(start = lag(time, default = 0),
end = time)
bind_rows(df1, df1 %>%
group_by(names, reps) %>%
summarise(start = last(time),
end = Inf,
value = sum(value))) %>%
select(-time) %>%
arrange(names, reps)
# names reps value start end
# <fct> <fct> <int> <dbl> <dbl>
# 1 Han A 2 0 1
# 2 Han A 2 1 2
# 3 Han A 1 2 3
# 4 Han A 1 3 4
# 5 Han A 3 4 5
# 6 Han A 2 5 6
# 7 Han A 0 6 7
# 8 Han A 2 7 8
# 9 Han A 2 8 9
#10 Han A 5 9 10
#11 Han A 20 10 Inf
#.....
Note that the summarise() above uses value = sum(value) for the Inf row; to get the difference from 50 asked for in the question, use value = 50 - sum(value) instead.
We can do this in data.table by shifting 'time' to create 'start', appending Inf to 'time' to create 'end', and appending the difference between 50 and the sum of 'value' to 'value', all after grouping by 'names' and 'reps':
library(data.table)
setDT(df)[, {stL <- last(time)
enL <- Inf
vL <- 50- sum(value)
.(start = c(shift(time, fill = 0), stL),
end = c(time, enL),
value = c(value, vL))}, .(names, reps)]
# names reps start end value
# 1: Luke A 0 1 0
# 2: Luke A 1 2 3
# 3: Luke A 2 3 3
# 4: Luke A 3 4 4
# 5: Luke A 4 5 0
# ---
#128: OB1 C 6 7 3
#129: OB1 C 7 8 0
#130: OB1 C 8 9 2
#131: OB1 C 9 10 5
#132: OB1 C 10 Inf 27
I want to change the format of a dataset in a certain way. Say I have a list of data indicating when and how many times participants attended counselling sessions. They could attend a maximum of three sessions any time within a twelve-week period. Say their data is recorded like so
set.seed(01234)
df1 <- data.frame(id = rep(LETTERS[1:4], each = 3),
session = rep(paste0("session", 1:3), length.out = 12),
week1 = c(sort(sample(1:12, 3, replace = F)),
sort(sample(1:12, 3, replace = F)),
sort(sample(1:12, 3, replace = F)),
sort(sample(1:12, 3, replace = F))))
df1$week1[c(3,8,9,12)] <- NA # insert some NAs representing sessions that weren't attended
And the dataset looks like this
# id session week1
# 1 A session1 2
# 2 A session2 7
# 3 A session3 NA
# 4 B session1 7
# 5 B session2 8
# 6 B session3 10
# 7 C session1 1
# 8 C session2 NA
# 9 C session3 NA
# 10 D session1 6
# 11 D session2 7
# 12 D session3 NA
But I want a long dataset where each person has a row for each of the twelve weeks they could have attended, like so
df2 <- data.frame(id = rep(LETTERS[1:4], each = 12),
week2 = rep(1:12, times = 4))
So participant A's data looks like this
df2[1:12,]
# id week2
# 1 A 1
# 2 A 2
# 3 A 3
# 4 A 4
# 5 A 5
# 6 A 6
# 7 A 7
# 8 A 8
# 9 A 9
# 10 A 10
# 11 A 11
# 12 A 12
I would like to merge the two somehow so that the numbers in the week1 column of df1 are matched to their appropriate row in df2, ideally something like this (example is participant A only)
data.frame(id = rep("A", 12),
week = 1:12,
attended = c(0,1,0,0,0,0,1,0,0,0,0,0))
# id week attended
# 1 A 1 0
# 2 A 2 1
# 3 A 3 0
# 4 A 4 0
# 5 A 5 0
# 6 A 6 0
# 7 A 7 1
# 8 A 8 0
# 9 A 9 0
# 10 A 10 0
# 11 A 11 0
# 12 A 12 0
One approach utilizing a merge:
# merge the 2 dataframes
names(df2)[2] <- "week"
names(df1)[3] <- "week"
df <- merge(df2, df1, by=c("id", "week"), all.x=T)
# replace 'session' with 1s and 0s
df$session <- as.integer(!is.na(df$session))
A base R alternative, using split() and lapply():
do.call(rbind, lapply(split(df2, df2$id), function(x){
  x$attended = as.integer(x$week2 %in% df1$week1[df1$id == x$id[1]])
  x
}))
You could expand the original data.frame using tidyr::complete so you don't need to merge; just define week1 as a factor with the correct number of levels:
library(dplyr)
library(tidyr)
df1 %>%
group_by(id) %>%
mutate(week1 = factor(week1, levels = 1:12),
session = !is.na(session)) %>%
complete(week1, fill = list(session = 0))
# A tibble: 52 x 3
# Groups: id [4]
id week1 session
<fct> <fct> <dbl>
1 A 1 0
2 A 2 1
3 A 3 0
4 A 4 0
5 A 5 0
6 A 6 0
7 A 7 1
8 A 8 0
9 A 9 0
10 A 10 0
# ... with 42 more rows
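To arrive at exactly the target structure from the question (integer week and a 0/1 attended column), a little cleanup can be appended (a sketch):
df1 %>%
  group_by(id) %>%
  mutate(week1 = factor(week1, levels = 1:12),
         session = !is.na(session)) %>%
  complete(week1, fill = list(session = 0)) %>%
  ungroup() %>%
  filter(!is.na(week1)) %>%  # drop the rows that held the unattended-session NAs
  transmute(id, week = as.integer(as.character(week1)),
            attended = as.integer(session))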
I ran into an issue with the RcppRoll package. I want to use it to sum the values of the past 3 months; however, sometimes there is no data for 1 or more months. With "n = 3" it considers the last three observations rather than the last 3 months. I couldn't find a solid solution, so I am trying my luck here. Thank you in advance for any suggestions.
P.S. I prefer to work with data.table and RcppRoll as my dataset is large and I am familiar with those.
Code:
library("data.table")
library("RcppRoll")
test = data.table(id = rep(1, 8),
                  date = c("2015-01", "2015-02", "2015-03", "2015-04",
                           "2015-08", "2015-09", "2015-10", "2015-11"),
                  value = 1:8)
test = test[, var:= roll_sumr(value, n = 3, na.rm = TRUE), by = id]
id date value var
1: 1 2015-01 1 NA
2: 1 2015-02 2 NA
3: 1 2015-03 3 6
4: 1 2015-04 4 9
5: 1 2015-08 5 12
6: 1 2015-09 6 15
7: 1 2015-10 7 18
8: 1 2015-11 8 21
Expected output
prefered_outcome = data.table(id = rep(1, 8),
                              date = c("2015-01", "2015-02", "2015-03", "2015-04",
                                       "2015-08", "2015-09", "2015-10", "2015-11"),
                              value = 1:8,
                              var = c(NA, NA, 6, 9, NA, NA, 18, 21))
id date value var
1: 1 2015-01 1 NA
2: 1 2015-02 2 NA
3: 1 2015-03 3 6
4: 1 2015-04 4 9
5: 1 2015-08 5 NA
6: 1 2015-09 6 NA
7: 1 2015-10 7 18
8: 1 2015-11 8 21
Define ym of class yearmon, and check whether the prior and second-prior ym are one and two months back; if so, use roll_sumr, otherwise use NA.
library(zoo)
ym <- test[, as.yearmon(date)]
test[, roll := ifelse(ym - 1/12 == shift(ym) & ym - 2/12 == shift(ym, 2),
                      roll_sumr(value, 3, na.rm = TRUE), NA), by = id ]
giving:
> test
id date value roll
1: 1 2015-01 1 NA
2: 1 2015-02 2 NA
3: 1 2015-03 3 6
4: 1 2015-04 4 9
5: 1 2015-08 5 NA
6: 1 2015-09 6 NA
7: 1 2015-10 7 18
8: 1 2015-11 8 21
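Note that ym is created outside the data.table above, so with more than one id the shift(ym) calls would not respect the by = id groups. A variant that keeps the month inside the table avoids this (a sketch under that assumption):
test[, ym := as.yearmon(date)]
test[, roll := ifelse(ym - 1/12 == shift(ym) & ym - 2/12 == shift(ym, 2),
                      roll_sumr(value, 3, na.rm = TRUE), NA), by = id]
test[, ym := NULL]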
You can add the missing months first and then perform the rolling sum; afterwards, the helper rows can be removed again. Note that na.rm = TRUE must be dropped here, otherwise the inserted NA months would simply be skipped again:
library(data.table)
library(RcppRoll)
library(zoo)
test = data.table(id = rep(1, 8),
                  date = c("2015-01", "2015-02", "2015-03", "2015-04",
                           "2015-08", "2015-09", "2015-10", "2015-11"),
                  value = 1:8)
test$date <- as.yearmon(test$date)
allMonths <- seq.Date(from = as.Date(min(test$date)),
                      to = as.Date(max(test$date)), by = "month")
df2 <- data.frame(date = as.yearmon(allMonths))
df3 <- merge(test, df2, all = TRUE)
df3[, id := nafill(id, type = "locf")]         # carry the id into the inserted months
df3[, var := roll_sumr(value, n = 3), by = id] # na.rm = FALSE: incomplete windows give NA
df3 <- df3[!is.na(value)]                      # drop the helper months again
df3
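With the corrections above, df3 reproduces the preferred outcome (a sketch; dates print in yearmon format):
#        date id value var
# 1: Jan 2015  1     1  NA
# 2: Feb 2015  1     2  NA
# 3: Mar 2015  1     3   6
# 4: Apr 2015  1     4   9
# 5: Aug 2015  1     5  NA
# 6: Sep 2015  1     6  NA
# 7: Oct 2015  1     7  18
# 8: Nov 2015  1     8  21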