Consider the following sample data. The data has 2 individuals per group and each individual has 2 entries.
rm(list = ls()); set.seed(1234)
G <- 4                                    # suppose you have 4 groups
nTot <- 8                                 # 2 individuals per group, so 8 individuals in total
group <- rep(1:G, rep(4, G))              # group identifier (4 rows per group)
individualID <- rep(1:nTot, rep(2, nTot)) # 2 individuals per group, each with 2 entries
n <- 2 * nTot                             # 16 entries in total
X <- rbinom(n, 1, 0.5)
Y <- runif(n, 0, 1)
Z <- runif(n, 0, 4)
df1 <- round(data.frame(group, individualID, X, Y, Z), 3)
> df1
group individualID X Y Z
1 1 1 0 0.286 1.219
2 1 1 1 0.267 2.029
3 1 2 1 0.187 0.724
4 1 2 1 0.232 3.039
5 2 3 1 0.317 0.805
6 2 3 1 0.303 1.035
7 2 4 0 0.159 3.969
8 2 4 0 0.040 3.229
9 3 5 1 0.219 2.213
10 3 5 1 0.811 2.586
11 3 6 1 0.526 1.247
12 3 6 1 0.915 2.487
13 4 7 0 0.831 1.319
14 4 7 1 0.046 2.008
15 4 8 0 0.456 2.708
16 4 8 1 0.265 1.940
The code below computes the sum of Func per group using the split() function.
Func <- X * Y + Z
GroupSum <- as.numeric(sapply(split(Func, group), sum)) # group sum of X*Y+Z
I would like code that splits the data and group-sums Func using only the first entry per individual, i.e. I should end up with a vector of 4 values, as we have 4 groups.
We may use a group-by approach: group by 'group', slice the first row, and then summarise to get the sum of X multiplied by Y plus Z.
library(dplyr)
df1 %>%
  group_by(group) %>%
  slice_head(n = 1) %>%
  summarise(out = sum(X * Y + Z, na.rm = TRUE))
Output:
# A tibble: 4 × 2
group out
<dbl> <dbl>
1 1 2.19
2 2 1.31
3 3 1.50
4 4 2.52
Or we can use duplicated() in base R:
aggregate(out ~ group,
          transform(subset(df1, !duplicated(group)), out = X * Y + Z),
          FUN = sum)
group out
1 1 2.194
2 2 1.311
3 3 1.501
4 4 2.522
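For completeness, the same task can be phrased in the question's own split() idiom. This sketch keeps the first row per individual (note the answers above slice the first row per group instead) before summing within group:
firsts <- df1[!duplicated(df1$individualID), ]  # first entry of each individual
as.numeric(sapply(split(firsts$X * firsts$Y + firsts$Z, firsts$group), sum))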
I have a dataframe grouped by grp:
df <- data.frame(
  v = rnorm(25),
  grp = c(rep("A", 10), rep("B", 15)),
  size = 2)
I want to flag the run-length of intervals determined by size. For example, for grp == "A", size is 2, and the number of rows is 10. So the interval should have length 10/2 = 5. This code, however, creates intervals with length 2:
df %>%
  group_by(grp) %>%
  mutate(interval = (row_number() - 1) %/% size)
# A tibble: 25 × 4
# Groups: grp [2]
v grp size interval
<dbl> <chr> <dbl> <dbl>
1 -0.166 A 2 0
2 -1.12 A 2 0
3 0.941 A 2 1
4 -0.913 A 2 1
5 0.486 A 2 2
6 -1.80 A 2 2
7 -0.370 A 2 3
8 -0.209 A 2 3
9 -0.661 A 2 4
10 -0.177 A 2 4
# … with 15 more rows
How can I flag the correct run-length of the size-determined intervals? The desired output is this:
# A tibble: 25 × 4
# Groups: grp [2]
v grp size interval
<dbl> <chr> <dbl> <dbl>
1 -0.166 A 2 0
2 -1.12 A 2 0
3 0.941 A 2 0
4 -0.913 A 2 0
5 0.486 A 2 0
6 -1.80 A 2 1
7 -0.370 A 2 1
8 -0.209 A 2 1
9 -0.661 A 2 1
10 -0.177 A 2 1
# … with 15 more rows
If I interpreted your question correctly, this small change should do the trick: n() / size is the interval length (10/2 = 5 for grp "A"), so the integer division now assigns five consecutive rows to each interval.
df %>%
  group_by(grp) %>%
  mutate(interval = (row_number() - 1) %/% (n() / size))
You can use gl():
df %>%
  group_by(grp) %>%
  mutate(interval = gl(first(size), ceiling(n() / first(size)))[1:n()])
Output:
# A tibble: 26 × 4
# Groups: grp [2]
v grp size interval
<dbl> <chr> <dbl> <fct>
1 -1.12 A 2 1
2 3.04 A 2 1
3 0.235 A 2 1
4 -0.0333 A 2 1
5 -2.73 A 2 1
6 -0.0998 A 2 1
7 0.976 A 2 2
8 0.414 A 2 2
9 0.912 A 2 2
10 1.98 A 2 2
11 1.17 A 2 2
12 -0.509 B 2 1
13 0.704 B 2 1
14 -0.198 B 2 1
15 -0.538 B 2 1
16 -2.86 B 2 1
17 -0.790 B 2 1
18 0.488 B 2 1
19 2.17 B 2 1
20 0.501 B 2 2
21 0.620 B 2 2
22 -0.966 B 2 2
23 0.163 B 2 2
24 -2.08 B 2 2
25 0.485 B 2 2
26 0.697 B 2 2
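For reference, gl(n, k) generates a factor with n levels, each repeated k times, so the [1:n()] subset simply trims the final block when the group size is not an exact multiple of size:
gl(2, 5)                      # 1 1 1 1 1 2 2 2 2 2
gl(2, ceiling(11 / 2))[1:11]  # six 1s then five 2s, matching the 11-row group above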
I want to perform matching between two groups in a data frame, where all rows belonging to one group (binary) are matched, with replacement, with observations from the other group whenever their difference on another column is smaller than a pre-set threshold. Let's use the toy dataset below:
set.seed(123)
df <- data.frame(id = 1:10,
                 group = rbinom(10, 1, 0.3),
                 value = round(runif(10), 2))
threshold <- round(sd(df$value), 2)
Which looks like this:
> df
id group value
1 1 0 0.96
2 2 1 0.45
3 3 0 0.68
4 4 1 0.57
5 5 1 0.10
6 6 0 0.90
7 7 0 0.25
8 8 1 0.04
9 9 0 0.33
10 10 0 0.95
> threshold
[1] 0.35
In this case, I want to match rows with group == 1 to rows with group == 0 where the difference between their value entries is smaller than threshold (0.35). This should lead to a data frame looking like this (apologies for potential errors, I built it manually):
id matched_id
1 2 3
2 2 7
3 2 9
4 4 3
5 4 6
6 4 7
7 4 9
8 5 7
9 5 9
10 8 7
11 8 9
Thank you!
You can use df |> left_join(df, by = character()), which is the tidyverse way of performing a cartesian product, and then filter according to the threshold.
library(dplyr)
df |>
  left_join(df, by = character()) |>
  filter(group.x != group.y,
         id.x < id.y,
         abs(value.x - value.y) < threshold)
#>   id.x group.x value.x id.y group.y value.y
#>1 2 1 0.45 3 0 0.68
#>2 2 1 0.45 7 0 0.25
#>3 2 1 0.45 9 0 0.33
#>4 3 0 0.68 4 1 0.57
#>5 4 1 0.57 6 0 0.90
#>6 4 1 0.57 7 0 0.25
#>7 4 1 0.57 9 0 0.33
#>8 5 1 0.10 7 0 0.25
#>9 5 1 0.10 9 0 0.33
#>10 7 0 0.25 8 1 0.04
#>11 8 1 0.04 9 0 0.33
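Note that by = character() is deprecated as of dplyr 1.1.0, which introduced cross_join() as the dedicated verb for a cartesian product; a sketch of the same pipeline:
df |>
  cross_join(df) |>
  filter(group.x != group.y,
         id.x < id.y,
         abs(value.x - value.y) < threshold)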
UPDATED ANSWER: The original version was slow on a larger dataset, so I tried to make the code a bit more efficient.
I came up with a solution that seems to do what I want. I am not sure how efficient it is on larger data, but it seems to work.
library(tidyverse)
library(reshape2) # provides melt() with a method for matrices
# All values
dist_mat <- df$value
# Adding identifier
names(dist_mat) <- df$id
# Dropping combinations that are not of interest
dist_mat_col <- dist_mat[df$group == 0]
dist_mat_row <- dist_mat[df$group == 1]
# Difference between each pair of values
dist_mat <- abs(outer(dist_mat_row, dist_mat_col, "-"))
# Identifying matches that fulfil the criterion
dist_mat <- dist_mat <= threshold
# From matrix to a long dataframe
dist_mat <- melt(dist_mat)
# Tidying up the dataframe and dropping unnecessary columns and rows
dist_mat <- dist_mat %>%
  rename(id = Var1,
         matched_id = Var2,
         cond = value) %>%
  filter(cond == TRUE) %>%
  left_join(df, by = "id") %>%
  select(id, matched_id)
This leads to the following dataframe:
> arrange(dist_mat, id)
id matched_id
1 2 3
2 2 7
3 2 9
4 4 3
5 4 6
6 4 7
7 4 9
8 5 7
9 5 9
10 8 7
11 8 9
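If melt() feels like a heavy dependency here, a base-R sketch of the same outer()-based idea indexes the logical matrix directly with which(..., arr.ind = TRUE) (assuming the named vectors dist_mat_row and dist_mat_col built above):
hits <- which(abs(outer(dist_mat_row, dist_mat_col, "-")) <= threshold,
              arr.ind = TRUE)  # (row, col) positions of matching pairs
matches <- data.frame(id         = as.integer(names(dist_mat_row)[hits[, "row"]]),
                      matched_id = as.integer(names(dist_mat_col)[hits[, "col"]]))
matches[order(matches$id), ]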
So let's say I have two data frames:
df1 <- data.frame(n = rep(c(0, 1, 2, 3, 4), times = 2),
                  nn = c(rep(1, 5), rep(2, 5)),
                  y = rnorm(10), z = rnorm(10))
df2 <- data.frame(x = rnorm(20))
Here is the first df:
> head(df1)
n nn y z
1 0 1 1.5683647 0.48934096
2 1 1 1.2967556 -0.77891030
3 2 1 -0.2375963 1.74355935
4 3 1 -1.2241501 -0.07838729
5 4 1 -0.3278127 -0.97555379
6 0 2 -2.4124503 0.07065982
Here is the second df:
x
1 -0.4884289
2 0.9362939
3 -1.0624084
4 -0.9838209
5 0.4242479
6 -0.4513135
I'd like to subtract the x column values of df2 from the z column values of df1, and return the rows of both data frames for which the subtracted value is approximately equal to the y value of df1.
Is there a way to construct such a function, so that I could specify the approximation to which the values should be equal?
So that it's clear: I'd like to subtract all x values from all z values, compare each result to the y column of df1, and check whether there is an approximately matching value.
Here's an approach where I match every row of df1 with every row of df2, then subtract x and y from z (as implied by your logic of comparing z - x to y; this is the same as comparing z - x - y to zero). Finally, I look at each row of df1 and keep the match with the lowest absolute difference.
library(dplyr)
left_join(
  df1 %>% mutate(dummy = 1, row = row_number()),
  df2 %>% mutate(dummy = 1, row = row_number()),
  by = "dummy") %>%
  mutate(diff = z - x - y) %>%
  group_by(row.x) %>%
  slice_min(abs(diff)) %>%
  ungroup()
Result (I used set.seed(42) before generating df1 and df2):
# A tibble: 10 x 9
n nn y z dummy row.x x row.y diff
<dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <int> <dbl>
1 0 1 1.37 1.30 1 1 0.0361 20 -0.102
2 1 1 -0.565 2.29 1 2 1.90 5 0.956
3 2 1 0.363 -1.39 1 3 -1.76 8 0.0112
4 3 1 0.633 -0.279 1 4 -0.851 18 -0.0607
5 4 1 0.404 -0.133 1 5 -0.609 14 0.0713
6 0 2 -0.106 0.636 1 6 0.705 12 0.0372
7 1 2 1.51 -0.284 1 7 -1.78 2 -0.0145
8 2 2 -0.0947 -2.66 1 8 -2.41 19 -0.148
9 3 2 2.02 -2.44 1 9 -2.41 19 -2.04
10 4 2 -0.0627 1.32 1 10 1.21 4 0.168
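If instead of the single closest match you want every pair within an explicit tolerance (the "approximation" the question asks about), swap slice_min() for a filter; tol below is a hypothetical cutoff you would choose yourself:
tol <- 0.1  # hypothetical tolerance
left_join(
  df1 %>% mutate(dummy = 1, row = row_number()),
  df2 %>% mutate(dummy = 1, row = row_number()),
  by = "dummy") %>%
  filter(abs(z - x - y) < tol)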
I observe 12 responses from each of 2 survey participants.
data <- data.frame(id = rep(1:2, each = 12),
                   response = c(2, 2, 3, 3, 6, 3, 6, 7, 3, 1, 4, 3,
                                3, 3, 6, 4, 2, 6, 7, 3, 2, 1, 5, 6))
data
id response
1 1 2
2 1 2
3 1 3
4 1 3
5 1 6
6 1 3
7 1 6
8 1 7
9 1 3
10 1 1
11 1 4
12 1 3
13 2 3
14 2 3
15 2 6
16 2 4
17 2 2
18 2 6
19 2 7
20 2 3
21 2 2
22 2 1
23 2 5
24 2 6
Now I want to add 2 things to the data of each survey participant:
a) The most frequent value of this survey participant
b) the relative frequency of the most frequent value
How can I add these things using dplyr:
data %>%
group_by(id) %>%
mutate(most_frequent_value = ?,
relative_frequency_of_most_frequent_value = ?)
I'd probably use a two-step solution. First, create a data frame of frequencies/relative frequencies, then join to it. We use slice(which.max()) because it returns exactly one row, whereas slice_max() may return multiple rows on ties.
library(tidyverse)
# count by id, response, calculate rel frequency
# rename columns to make inner_join easier
freq_table <- data %>%
  count(id, response) %>%
  group_by(id) %>%
  mutate(rel_freq = n / sum(n)) %>%
  select(id, most_frequent_response = response, rel_freq)
# inner join to sliced freq_table (grouping by id is preserved)
data %>%
  inner_join(freq_table %>% slice(which.max(rel_freq)))
# id response most_frequent_response rel_freq
# 1 1 2 3 0.4166667
# 2 1 2 3 0.4166667
# 3 1 3 3 0.4166667
# 4 1 3 3 0.4166667
# 5 1 6 3 0.4166667
# ...
You could try:
table(data$id, data$response) %>%
  as.data.frame() %>%
  setNames(c("id", "response", "n")) %>%
  group_by(id) %>%
  slice_max(n, n = 1) %>%
  group_by(response) %>%
  filter(n() > 1) %>%
  mutate(ratio = c(n[1] / sum(n), n[2] / sum(n)))
#> # A tibble: 2 x 4
#> # Groups: response [1]
#> id response n ratio
#> <fct> <fct> <int> <dbl>
#> 1 1 3 5 0.625
#> 2 2 3 3 0.375
Does this work?
data %>%
  group_by(id, response) %>%
  mutate(n = n()) %>%
  ungroup() %>%
  group_by(id) %>%
  mutate(most_frequent_value = response[n == max(n)][1],
         relative_frequency_of_most_frequent_value = max(n) / n())
# A tibble: 24 x 5
# Groups: id [2]
id response n most_frequent_value relative_frequency_of_most_frequent_value
<dbl> <dbl> <int> <dbl> <dbl>
1 1 2 2 3 0.417
2 1 2 2 3 0.417
3 1 3 5 3 0.417
4 1 3 5 3 0.417
5 1 6 2 3 0.417
6 1 3 5 3 0.417
7 1 6 2 3 0.417
8 1 7 1 3 0.417
9 1 3 5 3 0.417
10 1 1 1 3 0.417
11 1 4 1 3 0.417
12 1 3 5 3 0.417
13 2 3 3 3 0.25
14 2 3 3 3 0.25
15 2 6 3 3 0.25
16 2 4 1 3 0.25
17 2 2 2 3 0.25
18 2 6 3 3 0.25
19 2 7 1 3 0.25
20 2 3 3 3 0.25
21 2 2 2 3 0.25
22 2 1 1 3 0.25
23 2 5 1 3 0.25
24 2 6 3 3 0.25
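A one-pipe variant of the same idea with add_count(), which appends the per-(id, response) frequency without the explicit group_by()/ungroup() round trip (a sketch, not benchmarked):
library(dplyr)
data %>%
  add_count(id, response) %>%  # n = frequency of this response within id
  group_by(id) %>%
  mutate(most_frequent_value = response[which.max(n)],
         relative_frequency_of_most_frequent_value = max(n) / n()) %>%
  ungroup()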
I am trying to classify the temp variable into groups such that each group ends at a row where Dur > 5. Further, I want to find the maximum temp value within each group, as shown in the expected outcome.
Dur <- c(2.75, 0.25, 13, 0.25, 45.25, 0.25, 0.25, 4.25, 0.25, 0.25, 14)
temp <- c(2.54, 5.08, 0, 2.54, 0, 5, 2.54, 0, 2.54, 0, 2.54)
df <- data.frame(Dur, temp)
Expected Outcome:
group <- c(1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 3)
Colnew <- c(5.08, 5.08, 5.08, 2.54, 2.54, 5, 5, 5, 5, 5, 5)
(output <- data.frame(df, group, Colnew))
We create a grouping variable by taking the cumsum() of a logical vector, lagged by one row so that the row crossing Dur > 5 closes its group, then get the max of 'temp' within each group.
library(dplyr)
df %>%
  group_by(group = as.integer(factor(lag(cumsum(Dur > 5), default = 0)))) %>%
  mutate(Max = max(temp))
# A tibble: 11 x 4
# Groups: group [3]
# Dur temp group Max
# <dbl> <dbl> <int> <dbl>
# 1 2.75 2.54 1 5.08
# 2 0.25 5.08 1 5.08
# 3 13 0 1 5.08
# 4 0.25 2.54 2 2.54
# 5 45.2 0 2 2.54
# 6 0.25 5 3 5
# 7 0.25 2.54 3 5
# 8 4.25 0 3 5
# 9 0.25 2.54 3 5
#10 0.25 0 3 5
#11 14 2.54 3 5
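The same grouping logic in a base-R sketch, using ave() for the per-group maximum; the cumsum is shifted by one row so the row crossing Dur > 5 closes its group rather than opening the next one:
grp <- c(0, head(cumsum(df$Dur > 5), -1))  # lagged cumsum: 0 0 0 1 1 2 2 2 2 2 2
df$group  <- grp + 1                       # 1-based group ids: 1 1 1 2 2 3 3 3 3 3 3
df$Colnew <- ave(df$temp, grp, FUN = max)  # per-group maximum of temp
df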