I have the following data frame that contains points that originate from different samples. Each point has a type.
I need to calculate, for each point belonging to a given sample and of a given type (for instance "Sample_1", type "A"), how many points of another type are around it within a given cutoff.
My current implementation uses "future.apply" and I was wondering if there is a more efficient way to solve this problem. The example here is small and should run quickly; the real problem has several thousand rows and is much slower.
In the end I store the results in a list.
For each element whose type is in "starting_point", this list holds the number of elements of a type in "target_point" within a threshold of 40.
library(future)
library(future.apply)
a_test=data.frame(ID=sample(c("Sample_1", "Sample_2", "Sample_3"), 100, replace=TRUE), type=sample(c("A", "B", "C", "D"), 100, replace = TRUE), xpos=sample(1:200, 100, replace=TRUE), ypos=sample(1:200, 100, replace=TRUE))
starting_point=c("A", "B")
target_point=c("C", "D")
threshold=40
result_per_pair=list()
for(sp in starting_point){
  ## Here I select a data frame of "Starting points" without looking
  ## from which ID they came from
  sp_tdf=a_test[a_test$type==sp, ]
  for(tp in target_point){
    ## Here I select a data frame of "Target points" without looking
    ## from which ID they came from
    tp_tdf=a_test[a_test$type==tp, ]
    ## I use future_sapply here, parallelizing on each line of "sp_tdf"
    plan(multisession)
    elements_around=future_sapply(1:nrow(sp_tdf), function(x, sp_tdf, tp_tdf, threshold2){
      xc=sp_tdf$xpos[x]
      yc=sp_tdf$ypos[x]
      ### NOTE HERE: At this point I select the points that are in the same
      ### ID as the current line of sp_tdf
      tp_tdf2=tp_tdf[tp_tdf$ID == sp_tdf$ID[x], ]
      ares=tp_tdf2[ (tp_tdf2$xpos-xc)^2 + (tp_tdf2$ypos-yc)^2 < threshold2, ]
      return(nrow(ares))
    }, sp_tdf=sp_tdf, tp_tdf=tp_tdf, threshold2=threshold*threshold)
    a_newcol=paste0(tp, "_around_", sp)
    ## we need to create a copy of sp_tdf otherwise we add columns to the
    ## initial sp_tdf and we memorize them in the wrong place in the list
    sp_tdf_temp=sp_tdf
    sp_tdf_temp[, a_newcol]=elements_around
    result_per_pair[[ a_newcol ]]=rbind(result_per_pair[[ a_newcol ]], sp_tdf_temp)
  }
}
You can see the type of table I get here:
result_per_pair[1]
$C_around_A
ID type xpos ypos C_around_A
1 Sample_2 A 26 74 1
2 Sample_3 A 64 8 1
3 Sample_3 A 121 2 1
5 Sample_2 A 62 94 0
You can try using the RANN::nn2 function:
library(dplyr)
library(purrr)
id_list <- split(a_test, a_test$ID)
res <- id_list %>%
map(~select(.x, xpos, ypos)) %>%
map(~RANN::nn2(.x, .x, k = nrow(.), searchtype = "radius", radius = threshold)) %>%
map(1) %>%
map2(
id_list,
function(x, y){
seq_len(nrow(x)) %>%
map(~x[.x,] %>% .[. > 0]) %>%
map(~y[.x,]) %>%
map("type") %>%
map_dfr(table) %>%
mutate(across(everything(), as.integer))
}
) %>%
map2_dfr(id_list, ~bind_cols(.y, .x))
Some speed improvements might be possible by replacing the tidyverse functions with faster base equivalents (hard to say how fast it is on your example). Result:
res %>% head()
ID type xpos ypos A B C D
Sample_1 C 48 157 0 0 3 1
Sample_1 D 177 97 1 1 1 3
Sample_1 C 10 71 0 0 3 0
Sample_1 C 71 168 1 1 2 0
Sample_1 D 82 48 1 0 1 2
Sample_1 C 165 71 3 3 1 1
where columns A-D represent the count of each type within the same ID. I used seed 123 to generate a_test. You can adapt the algorithm to work with starting_point and target_point by splitting each element of id_list into two parts, one defined by starting_point and one by target_point, and adapting the data and query arguments of RANN::nn2.
Edit
A function based on the ideas in the comment above:
f <- function(df, threshold, start = levels(df$type), target = levels(df$type)){
my_lists <- df %>%
filter(type %in% c(start, target)) %>%
split(.$ID) %>%
map(
function(x){
map(
list(start, target),
~filter(x, type %in% .x) %>% mutate(type = droplevels(type))
)
}
) %>%
discard(~any(map_int(.x, nrow) == 0))
indices <- my_lists %>%
map(
~RANN::nn2(
data = select(.x[[2]], xpos, ypos),
query = select(.x[[1]], xpos, ypos),
k = nrow(.x[[2]]),
searchtype = "radius",
radius = threshold
)
) %>%
map(1) %>%
map(function(x) seq_len(nrow(x)) %>% map(~x[.x,] %>% .[. > 0]))
my_lists %>%
map(2) %>%
map2(indices, function(x, y) map_dfr(y, ~summary(x[.x,]$type))) %>%
{map2_dfr(map(my_lists, 1), ., bind_cols)}
}
To count C around A with radius 40 (note that f assumes type is a factor, so convert it first if data.frame() left it as a character column, which is the default since R 4.0):
a_test$type <- factor(a_test$type)
f(a_test, 40, "A", "C")
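As a rough illustration of the earlier remark about swapping the tidyverse calls for faster base functions, here is a minimal sketch (count_around is a hypothetical helper name; a_test, the types and the radius are taken from the question) that counts, per ID, how many target points lie within the radius of each starting point:
## minimal base-R + RANN sketch; not part of the original answer
count_around <- function(df, start_type, target_type, threshold){
  do.call(rbind, lapply(split(df, df$ID), function(d){
    sp <- d[d$type == start_type, ]
    tp <- d[d$type == target_type, ]
    if(nrow(sp) == 0) return(NULL)
    if(nrow(tp) == 0){ sp$n_around <- 0L; return(sp) }
    nn <- RANN::nn2(data = tp[, c("xpos", "ypos")],
                    query = sp[, c("xpos", "ypos")],
                    k = nrow(tp), searchtype = "radius", radius = threshold)
    ## nn$nn.idx is 0 where no neighbour was found within the radius
    sp$n_around <- rowSums(nn$nn.idx > 0)
    sp
  }))
}
count_around(a_test, "A", "C", 40)
Looping this over the start/target pairs from the question would rebuild the same structure as result_per_pair.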
df <- data.frame('Dev' = 1:12,
'GWP' = seq(10,120,10),
'2012' = 1:12,
'Inc' = seq(10,120,10),
'GWP2' = c(seq(10,100,10),NA,NA),
'2013'= 1:12,
'Inc2' = c(seq(10,100,10),NA,NA),
'GWP3' = c(seq(10,80,10),NA,NA,NA,NA),
'2014'= 1:12,
'Inc3' = c(seq(10,80,10),NA,NA,NA,NA))
head(df)
result_df <- data.frame('Dev' = rep(1:12,3),
'GWP' = c(seq(10,120,10),
c(seq(10,100,10),NA,NA),
c(seq(10,80,10),NA,NA,NA,NA)),
'YEAR' = c(rep(2012,12),
rep(2013,12),
rep(2014,12)),
'Inc' = c(seq(10,120,10),
c(seq(10,100,10),NA,NA),
c(seq(10,80,10),NA,NA,NA,NA)))
head(result_df)
The above is my data structure.
I'm trying to make df look like result_df. I'm assuming the reshape2 library would somehow do the trick, but I'm having trouble getting it to come out as expected:
library(reshape2)
x <- melt(df, id = c("Dev"))
x$value <- ifelse(x$variable == 'X2012',2012,
ifelse(x$variable == 'X2013',2013,
ifelse(x$variable == 'X2014',2014,x$value)))
x$variable <- ifelse(x$variable %in% c('GWP','GWP2','GWP3'),'GWP',
ifelse(x$variable %in% c('Inc','Inc2','Inc3'), 'Inc',
ifelse(x$variable %in% c('X2012','X2013','X2014'),"Year",
x$variable)))
The problem is that the "year" columns in my actual data can span 20-30 years, and I want to avoid using multiple ifelse statements to map them. Is there a way to do this?
The data needs some pre-processing to get the expected output. Using the tidyverse, one possible way is:
library(tidyverse)
df %>%
gather(key, value, -Dev) %>%
mutate(col = case_when(str_detect(key, "^GWP") ~ "GWP",
str_detect(key, "^X") ~ "Year",
str_detect(key, "^Inc") ~ "Inc"),
value = ifelse(col == "Year", sub("^X", "", key), value)) %>%
select(-key) %>%
group_by(col) %>%
mutate(Dev1 = row_number()) %>%
spread(col, value) %>%
select(-Dev1)
# A tibble: 36 x 4
# Dev GWP Inc Year
# <int> <chr> <chr> <chr>
# 1 1 10 10 2012
# 2 1 10 10 2013
# 3 1 10 10 2014
# 4 2 20 20 2012
# 5 2 20 20 2013
# 6 2 20 20 2014
# 7 3 30 30 2012
# 8 3 30 30 2013
# 9 3 30 30 2014
#10 4 40 40 2012
# … with 26 more rows
I found that this works for the first part:
apply(matrix(c(2012:2014)), 1, function(y) x$value[x$variable == paste("X", y, sep = "")] <<- y )
Create a 1-dimensional matrix to iterate over using apply.
Create a function to replace the values found through masking.
Note the use of <<-: it assigns the respective values to the x scoped one level above the function defined in the apply.
Note that it applies the function to the variable x and returns the values used in the replacement.
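If the year columns span 20-30 years, a hedged alternative to the <<- loop (a sketch, not part of the original answer) is to derive the year from the melted column names in one vectorised step:
## assumes x from melt(df, id = "Dev") above, where data.frame() renamed
## the year columns to X2012, X2013, ...
v <- as.character(x$variable)
yr <- grepl("^X[0-9]{4}$", v)
x$value[yr] <- as.numeric(sub("^X", "", v[yr]))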
For the second part:
x$variable[x$variable %in% c('GWP', 'GWP2', 'GWP3')] <- "GWP"
x$variable[x$variable %in% c('Inc', 'Inc2', 'Inc3')] <- "Inc"
Since the variable column is type factor and Year is not a level:
x <- transform(x, variable = as.character(variable))
x$variable[x$variable %in% c('X2012', 'X2013', 'X2014')] <- "Year"
x <- transform(x, variable = as.factor(variable))
I have data for which I want to summarize group means. I then would like to re-group some of the smaller groups (matching a certain n < x condition) into a group called "others". I found a way to do this. But it feels like there are more efficient solutions out there. I wonder how a data.table approach would solve the problem.
Here is an example using tibble and dplyr.
# preps
library(tibble)
library(dplyr)
set.seed(7)
# generate 4 groups with more observations
tbl_1 <- tibble(group = rep(sample(letters[1:4], 150, TRUE), each = 4),
score = sample(0:10, size = 600, replace = TRUE))
# generate 3 groups with less observations
tbl_2 <- tibble(group = rep(sample(letters[5:7], 50, TRUE), each = 3),
score = sample(0:10, size = 150, replace = TRUE))
# put them into one data frame
tbl <- rbind(tbl_1, tbl_2)
# aggregate the mean scores and count the observations for each group
tbl_agg1 <- tbl %>%
group_by(group) %>%
summarize(MeanScore = mean(score),
n = n())
So far so easy.
Next I want to only show groups with more than 100 observations. All other groups should be merged into one group called "others".
# First, calculate summary stats for groups with n < 100
tbl_agg2 <- tbl_agg1 %>%
filter(n<100) %>%
summarize(MeanScore = weighted.mean(MeanScore, n),
sumN = sum(n))
Note: there was a mistake in the calculation above which is now corrected (@Frank: thanks for spotting it!)
# Second, remove groups with n < 100 from the aggregate table and instead add a row containing the summary statistics calculated above
tbl_agg1 <- tbl_agg1 %>%
filter(n>100) %>%
add_row(group = "others", MeanScore = tbl_agg2[["MeanScore"]], n = tbl_agg2[["sumN"]])
tbl_agg1 basically shows what I want it to show, but I wonder if there is a smoother, more efficient way to do this. At the same time I wonder how a data.table approach would deal with the problem at hand.
I welcome any suggestions.
Your calculation for the "other" group is wrong, I guess... should be...
tbl_agg1 %>% {bind_rows(
filter(., n>100),
filter(., n<100) %>%
summarize(group = "other", MeanScore = weighted.mean(MeanScore, n), n = sum(n))
)}
However, you could keep things a lot simpler from the start by using a different grouping variable:
tbl %>%
group_by(group) %>%
group_by(g = replace(group, n() < 100, "other")) %>%
summarise(n = n(), m = mean(score))
# A tibble: 5 x 3
g n m
<chr> <int> <dbl>
1 a 136 4.79
2 b 188 4.49
3 c 160 5.32
4 d 116 4.78
5 other 150 5.42
Or with data.table
library(data.table)
DT = data.table(tbl)
DT[, n := .N, by=group]
DT[, .(.N, m = mean(score)), keyby=.(g = replace(group, n < 100, "other"))]
g N m
1: a 136 4.786765
2: b 188 4.489362
3: c 160 5.325000
4: d 116 4.784483
5: other 150 5.420000
I'm trying to apply a function over the rows of a data frame and return a value based on the value of each element in a column. I'd prefer to pass the whole data frame instead of naming each variable, as the actual code has many variables - this is a simplified example.
I've tried purrr map_dbl and rowwise but can't get either to work. Any suggestions please?
#sample df
df <- data.frame(Y=c("A","B","B","A","B"),
X=c(1,5,8,23,31))
#required result
Res <- data.frame(Y=c("A","B","B","A","B"),
X=c(1,5,8,23,31),
NewVal=c(10,500,800,230,3100)
)
#use mutate and map or rowwise etc
Res <- df %>%
mutate(NewVal=map_dbl(.x=.,.f=FnAdd(.)))
Res <- df %>%
rowwise() %>%
mutate(NewVal=FnAdd(.))
#sample fn
FnAdd <- function(Data){
if(Data$Y=="A"){
X=Data$X*10
}
if(Data$Y=="B"){
X=Data$X*100
}
return(X)
}
If there are multiple values, it is better to have a key/val dataset, join, and then do the multiplication:
keyVal <- data.frame(Y = c("A", "B"), NewVal = c(10, 100))
df %>%
left_join(keyVal) %>%
mutate(NewVal = X*NewVal)
# Y X NewVal
#1 A 1 10
#2 B 5 500
#3 B 8 800
#4 A 23 230
#5 B 31 3100
It is not clear how many unique values there are in the actual dataset's 'Y' column. If there are only a few values, then case_when can be used:
FnAdd <- function(Data){
Data %>%
mutate(NewVal = case_when(Y == "A" ~ X * 10,
Y == "B" ~ X *100,
TRUE ~ X))
}
FnAdd(df)
# Y X NewVal
#1 A 1 10
#2 B 5 500
#3 B 8 800
#4 A 23 230
#5 B 31 3100
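For completeness, a rough base-R sketch of the same key/value idea, using a named lookup vector instead of a join (the multipliers here are assumed from the example, not part of the original answer):
## a named vector acts as the key/value table; scales to many Y categories
mult <- c(A = 10, B = 100)
transform(df, NewVal = unname(X * mult[as.character(Y)]))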
You were originally looking for a solution using dplyr's rowwise() function, so here is that solution. The nice thing about this approach is that you don't need to create a separate function.
Here's the version using ifelse():
df %>%
  rowwise() %>%
  mutate(NewVal = ifelse(Y == "A", X * 10,
                         ifelse(Y == "B", X * 100, NA_real_))) # NA for any other Y
and here's the version using case_when:
df %>%
rowwise() %>%
mutate(NewVal = case_when(Y == "A" ~ X * 10,
Y == "B" ~ X * 100))
Starting with data containing multiple observations for each group, like this:
set.seed(1)
my.df <- data.frame(
timepoint = rep(c(0, 1, 2), each= 3),
counts = round(rnorm(9, 50, 10), 0)
)
> my.df
timepoint counts
1 0 44
2 0 52
3 0 42
4 1 66
5 1 53
6 1 42
7 2 55
8 2 57
9 2 56
To perform a summary calculation at each timepoint relative to timepoint == 0, for each group I need to pass a vector of counts for timepoint == 0 and a vector of counts for the group (e.g. timepoint == 0) to an arbitrary function, e.g.
NonsenseFunction <- function(x, y){
(mean(x) - mean(y)) / (1 - mean(y))
}
I can get the required output from this table, either with dplyr:
library(dplyr)
my.df %>%
group_by(timepoint) %>%
mutate(rep = paste0("r", 1:n())) %>%
left_join(x = ., y = filter(., timepoint == 0), by = "rep") %>%
group_by(timepoint.x) %>%
summarise(result = NonsenseFunction(counts.x, counts.y))
or data.table:
library(data.table)
my.dt <- data.table(my.df)
my.dt[, rep := paste0("r", 1:length(counts)), by = timepoint]
merge(my.dt, my.dt[timepoint == 0], by = "rep", all = TRUE)[
, NonsenseFunction(counts.x, counts.y), by = timepoint.x]
This only works if the number of observations between groups is the same. Anyway, the observations aren't matched, so using the temporary rep variable seems hacky.
For a more general case, where I need to pass vectors of the baseline values and the group's values to an arbitrary (more complicated) function, is there an idiomatic data.table or dplyr way of doing so with a grouped operation for all groups?
Here's the straightforward data.table approach:
my.dt[, NonsenseFunction(counts, my.dt[timepoint==0, counts]), by=timepoint]
This probably grabs my.dt[timepoint==0, counts] again and again, for each group. You could instead save that value ahead of time:
v = my.dt[timepoint==0, counts]
my.dt[, NonsenseFunction(counts, v), by=timepoint]
... or if you don't want to add v to the environment, maybe
with(list(v = my.dt[timepoint==0, counts]),
  my.dt[, NonsenseFunction(counts, v), by=timepoint]
)
You could pass the vector from your reference group (timepoint == 0) as a constant second argument.
my.df %>%
group_by(timepoint) %>%
mutate(response = NonsenseFunction(counts, my.df$counts[my.df$timepoint == 0]))
Or if you want to compute it beforehand:
constant <- my.df$counts[my.df$timepoint == 0]
my.df %>%
group_by(timepoint) %>%
mutate(response = NonsenseFunction(counts, constant))
You can try,
library(dplyr)
my.df %>%
mutate(new = mean(counts[timepoint == 0])) %>%
group_by(timepoint) %>%
summarise(result = NonsenseFunction(counts, new))
# A tibble: 3 × 2
# timepoint result
# <dbl> <dbl>
#1 0 0.0000000
#2 1 0.1398601
#3 2 0.2097902