Hi I have a dataset like this:
City = c(1,1,1,1,1,2,2,2,2,2,3,3,3,3)
Area=c("A","B","A","B","A","A","B","B","B","A","A","B","A","A")
Weights=c(2.4,1.9,0.51,0.7,2.2,1.5,1.86,1.66,1.09,2.57,2.4,0.9,3.4,3.7)
Tax=c(16,93,96,44,67,73,12,65,81,22,39,94,41,30)
z = data.frame(City,Area,Weights,Tax)
What I want to do is to obtain the weighted tax for each City and each area.
For example, for row 1 above the computed value is 2.4*16/(2.40+0.51+2.20), and so on.
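Spelled out in R, that first value works out to:
2.4 * 16 / (2.40 + 0.51 + 2.20)
# [1] 7.514677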
I can do that using this function:
weighted_tax = function(z){
  cit_data = list()
  for (cit in unique(z$City)){
    city_data = z[z$City == cit, ]
    area_new = list()
    for (ar in unique(city_data$Area)){
      area_data = city_data[city_data$Area == ar, ]
      area_data$area_dat_n = (area_data$Weights * area_data$Tax) / sum(area_data$Weights)
      area_new = rbind(area_new, area_data)
    }
    cit_data = rbind(cit_data, area_new)
  }
  return(cit_data)
}
tax=weighted_tax(z)
Is there an easier/cleaner way to implement this? Thanks in advance.
Using dplyr:
library(dplyr)
z %>%
  group_by(City, Area) %>%
  mutate(Weighted_tax = Tax * Weights / sum(Weights))
Output:
# A tibble: 14 x 5
# Groups: City, Area [6]
City Area Weights Tax Weighted_tax
<dbl> <fct> <dbl> <dbl> <dbl>
1 1 A 2.4 16 7.51
2 1 B 1.9 93 68.0
3 1 A 0.51 96 9.58
4 1 B 0.7 44 11.8
5 1 A 2.2 67 28.8
6 2 A 1.5 73 26.9
7 2 B 1.86 12 4.84
8 2 B 1.66 65 23.4
9 2 B 1.09 81 19.2
10 2 A 2.57 22 13.9
11 3 A 2.4 39 9.85
12 3 B 0.9 94 94.
13 3 A 3.4 41 14.7
14 3 A 3.7 30 11.7
We could also do this in base R with by:
do.call(rbind, by(z, z[c("City", "Area")], function(x)
cbind(x, area.dat.n=with(x, Weights * Tax / sum(Weights)))))
# City Area Weights Tax area.dat.n
# 1 1 A 2.40 16 7.514677
# 3 1 A 0.51 96 9.581213
# 5 1 A 2.20 67 28.845401
# 6 2 A 1.50 73 26.904177
# 10 2 A 2.57 22 13.891892
# 11 3 A 2.40 39 9.852632
# 13 3 A 3.40 41 14.673684
# 14 3 A 3.70 30 11.684211
# 2 1 B 1.90 93 67.961538
# 4 1 B 0.70 44 11.846154
# 7 2 B 1.86 12 4.841649
# 8 2 B 1.66 65 23.405640
# 9 2 B 1.09 81 19.151844
# 12 3 B 0.90 94 94.000000
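Note that by() returns the rows grouped by Area rather than in the original order; since the original row names are preserved, the order can be restored afterwards (a small sketch, assuming the result is stored in res):
res <- do.call(rbind, by(z, z[c("City", "Area")], function(x)
  cbind(x, area.dat.n = with(x, Weights * Tax / sum(Weights)))))
# reorder by the preserved (numeric) row names to recover the input order
res[order(as.numeric(rownames(res))), ]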
Or with ave:
cbind(z, area.dat.n = with(z, Weights * Tax / ave(Weights, City, Area, FUN = sum)))
#    City Area Weights Tax area.dat.n
# 1     1    A    2.40  16   7.514677
# 2     1    B    1.90  93  67.961538
# 3     1    A    0.51  96   9.581213
# 4     1    B    0.70  44  11.846154
# 5     1    A    2.20  67  28.845401
# 6     2    A    1.50  73  26.904177
# 7     2    B    1.86  12   4.841649
# 8     2    B    1.66  65  23.405640
# 9     2    B    1.09  81  19.151844
# 10    2    A    2.57  22  13.891892
# 11    3    A    2.40  39   9.852632
# 12    3    B    0.90  94  94.000000
# 13    3    A    3.40  41  14.673684
# 14    3    A    3.70  30  11.684211
Data
z <- structure(list(City = c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3,
3), Area = structure(c(1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 1L,
1L, 2L, 1L, 1L), .Label = c("A", "B"), class = "factor"), Weights = c(2.4,
1.9, 0.51, 0.7, 2.2, 1.5, 1.86, 1.66, 1.09, 2.57, 2.4, 0.9, 3.4,
3.7), Tax = c(16, 93, 96, 44, 67, 73, 12, 65, 81, 22, 39, 94,
41, 30)), class = "data.frame", row.names = c(NA, -14L))
I need to divide columns despesatotal and despesamonetaria by the row named Total:
Let's suppose your data set is df.
# 1) Delete the last row
df <- df[-nrow(df),]
# 2) Build the desired data.frame, combining the CNAE names and the proportion columns
new.df <- cbind(grup_CNAE = df$grup_CNAE,
                100 * prop.table(df[, -1], margin = 2))
Finally, rename your columns. Be careful with matrix versus data.frame formats, because mathematical operations can sometimes pose a problem. If you use the dput function to give us a reproducible example, the answer will be more accurate.
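As for the renaming step, a minimal sketch (assuming a _pct suffix is wanted for the new proportion columns):
# hypothetical suffix; adjust to taste
names(new.df)[-1] <- paste0(names(new.df)[-1], "_pct")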
Here is a way to get it done. This is not the best way, but I think it is very readable.
Suppose this is your data frame:
mydf = structure(list(grup_CNAE = c("A", "B", "C", "D", "E", "Total"
), despesatotal = c(71, 93, 81, 27, 39, 311), despesamonetaria = c(7,
72, 36, 22, 73, 210)), row.names = c(NA, -6L), class = "data.frame")
mydf
# grup_CNAE despesatotal despesamonetaria
#1 A 71 7
#2 B 93 72
#3 C 81 36
#4 D 27 22
#5 E 39 73
#6 Total 311 210
To divide the despesatotal values by the total value, you need to use the total (311 in this example) as the denominator. Note that the total value is located in the last row; you can identify its position by indexing the despesatotal column and using nrow() as the index value.
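For example, the two denominators can be pulled out directly:
# the Total row is the last row of mydf
mydf$despesatotal[nrow(mydf)]       # 311
mydf$despesamonetaria[nrow(mydf)]   # 210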
library(dplyr)
mydf |> mutate(percentage1 = despesatotal / despesatotal[nrow(mydf)],
               percentage2 = despesamonetaria / despesamonetaria[nrow(mydf)])
# grup_CNAE despesatotal despesamonetaria percentage1 percentage2
#1 A 71 7 0.22829582 0.03333333
#2 B 93 72 0.29903537 0.34285714
#3 C 81 36 0.26045016 0.17142857
#4 D 27 22 0.08681672 0.10476190
#5 E 39 73 0.12540193 0.34761905
#6 Total 311 210 1.00000000 1.00000000
library(tidyverse)
Sample data
# A tibble: 11 x 3
group despesatotal despesamonetaria
<chr> <int> <int>
1 1 198 586
2 2 186 525
3 3 202 563
4 4 300 562
5 5 126 545
6 6 215 529
7 7 183 524
8 8 163 597
9 9 213 592
10 10 175 530
11 Total 1961 5553
df %>%
  mutate(percentage_total = despesatotal / last(despesatotal),
         percentage_monetaria = despesamonetaria / last(despesamonetaria)) %>%
  slice(-nrow(.))
# A tibble: 10 x 5
group despesatotal despesamonetaria percentage_total percentage_monetaria
<chr> <int> <int> <dbl> <dbl>
1 1 198 586 0.101 0.106
2 2 186 525 0.0948 0.0945
3 3 202 563 0.103 0.101
4 4 300 562 0.153 0.101
5 5 126 545 0.0643 0.0981
6 6 215 529 0.110 0.0953
7 7 183 524 0.0933 0.0944
8 8 163 597 0.0831 0.108
9 9 213 592 0.109 0.107
10 10 175 530 0.0892 0.0954
This is a good place to use dplyr::mutate(across()) to divide all relevant columns by the Total row. Note this is not sensitive to the order of the rows and will apply the manipulation to all numeric columns. You can supply any tidyselect semantics to across() instead if needed in your case.
library(tidyverse)
# make sample data
d <- tibble(grup_CNAE = paste0("Group", 1:12),
despesatotal = sample(1e6:5e7, 12),
despesamonetaria = sample(1e6:5e7, 12)) %>%
add_row(grup_CNAE = "Total", summarize(., across(where(is.numeric), sum)))
# divide numeric columns by value in "Total" row
d %>%
mutate(across(where(is.numeric), ~./.[grup_CNAE == "Total"]))
#> # A tibble: 13 × 3
#> grup_CNAE despesatotal despesamonetaria
#> <chr> <dbl> <dbl>
#> 1 Group1 0.117 0.0204
#> 2 Group2 0.170 0.103
#> 3 Group3 0.0451 0.0837
#> 4 Group4 0.0823 0.114
#> 5 Group5 0.0170 0.0838
#> 6 Group6 0.0174 0.0612
#> 7 Group7 0.163 0.155
#> 8 Group8 0.0352 0.0816
#> 9 Group9 0.0874 0.135
#> 10 Group10 0.113 0.0877
#> 11 Group11 0.0499 0.0495
#> 12 Group12 0.104 0.0251
#> 13 Total 1 1
Created on 2022-11-08 with reprex v2.0.2
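For example, to limit the division to specific named columns rather than every numeric column, a sketch using the column names from the sample data above:
d %>%
  mutate(across(c(despesatotal, despesamonetaria),
                ~ . / .[grup_CNAE == "Total"]))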
I have several data frames with monthly data, and I would like to find the percentage distribution for each product and each month. My problem is handling multiple month columns: currently, I can only get a percentage by group for one month.
data <- data.frame(group = rep(LETTERS[1:3], each = 4),
Product = letters[1:4],
January = sample(1:100,12),
February = sample(1:100,12))
data_new1 <- transform(data,
perc = ave(January,
group,
FUN = prop.table))
data_new1$perc<-round(data_new1$perc, 2)
> data_new1
group Product January February perc
1 A a 12 16 0.05
2 A b 73 75 0.32
3 A c 78 11 0.34
4 A d 65 35 0.29
5 B a 86 63 0.36
6 B b 33 71 0.14
7 B c 92 49 0.38
8 B d 30 60 0.12
9 C a 91 59 0.37
10 C b 31 45 0.12
11 C c 99 7 0.40
12 C d 28 50 0.11
tidyverse
library(dplyr)
data %>%
  group_by(group) %>%
  mutate(across(c("January", "February"), proportions, .names = "{.col}_perc")) %>%
  ungroup()
# A tibble: 12 x 6
group Product January February January_perc February_perc
<chr> <chr> <int> <int> <dbl> <dbl>
1 A a 49 40 0.426 0.252
2 A b 1 3 0.00870 0.0189
3 A c 19 50 0.165 0.314
4 A d 46 66 0.4 0.415
5 B a 61 82 0.218 0.285
6 B b 88 51 0.314 0.177
7 B c 32 75 0.114 0.260
8 B d 99 80 0.354 0.278
9 C a 6 31 0.0397 0.373
10 C b 8 5 0.0530 0.0602
11 C c 92 20 0.609 0.241
12 C d 45 27 0.298 0.325
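A small compatibility note: proportions() is a relatively recent base-R alias for prop.table(), so on older R versions the same pipeline can use prop.table() instead (a sketch):
data %>%
  group_by(group) %>%
  mutate(across(c("January", "February"), prop.table, .names = "{.col}_perc")) %>%
  ungroup()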
base
data <- data.frame(group = rep(LETTERS[1:3], each = 4),
Product = letters[1:4],
January = sample(1:100,12),
February = sample(1:100,12))
tmp <- sapply(c("January", "February"), function (x) ave(data[[x]], data$group, FUN = prop.table))
colnames(tmp) <- paste0(colnames(tmp), "_perc")
res <- cbind(data, tmp)
res
#> group Product January February January_perc February_perc
#> 1 A a 42 73 0.18260870 0.238562092
#> 2 A b 67 92 0.29130435 0.300653595
#> 3 A c 58 90 0.25217391 0.294117647
#> 4 A d 63 51 0.27391304 0.166666667
#> 5 B a 48 15 0.21621622 0.081521739
#> 6 B b 16 82 0.07207207 0.445652174
#> 7 B c 80 75 0.36036036 0.407608696
#> 8 B d 78 12 0.35135135 0.065217391
#> 9 C a 81 16 0.32793522 0.117647059
#> 10 C b 83 81 0.33603239 0.595588235
#> 11 C c 11 1 0.04453441 0.007352941
#> 12 C d 72 38 0.29149798 0.279411765
Created on 2021-12-20 by the reprex package (v2.0.1)
data.table
library(data.table)
COLS <- c("January", "February")
COLS_RES <- paste0(COLS, "_perc")
setDT(data)[, (COLS_RES) := lapply(.SD, proportions), by = group, .SDcols = COLS][]
These calculations are easier if your data is structured in a tidy way. In your case, January and February should probably be one single variable called month or something.
Example:
Below, I use tidyr::pivot_longer() to combine January and February into one column. Then I use the dplyr package to group the data frame and calculate perc. I'm not using prop.table(), but I believe you just want the proportion of each observation relative to the total of that group and month.
library(dplyr)
library(tidyr)
# To make the sampling below reproducible
set.seed(1)
data <- data.frame(
group = rep(LETTERS[1:3], each = 4),
Product = letters[1:4],
January = sample(1:100,12),
February = sample(1:100,12)
)
data %>%
  pivot_longer(c(January, February), names_to = "month", values_to = "x") %>%
  group_by(group, month) %>%
  mutate(perc = round(x / sum(x), 2))
I hope this is what you were looking for.
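If you want the wide layout back afterwards (one row per group and Product, with a value and a perc column per month), a sketch continuing the pipeline above:
data %>%
  pivot_longer(c(January, February), names_to = "month", values_to = "x") %>%
  group_by(group, month) %>%
  mutate(perc = round(x / sum(x), 2)) %>%
  ungroup() %>%
  pivot_wider(names_from = month, values_from = c(x, perc))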
Another dplyr solution:
library(dplyr)
data %>%
  group_by(group) %>%
  mutate(across(c(2:5),
                ~ . / sum(.) * 100, .names = "{.col}_pct"))
# A tibble: 12 × 10
# Groups: group [3]
group Product Jan Feb Mar May Jan_pct Feb_pct Mar_pct May_pct
<chr> <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl>
1 A a 14 14 95 50 8 18.4 44.4 20.9
2 A b 100 33 28 32 57.1 43.4 13.1 13.4
3 A c 11 16 13 95 6.29 21.1 6.07 39.7
4 A d 50 13 78 62 28.6 17.1 36.4 25.9
5 B a 29 42 72 13 22.0 33.9 20.3 7.07
6 B b 3 4 88 41 2.27 3.23 24.9 22.3
7 B c 30 68 94 86 22.7 54.8 26.6 46.7
8 B d 70 10 100 44 53.0 8.06 28.2 23.9
9 C a 4 88 45 84 3.96 43.6 24.2 30.7
10 C b 52 12 26 55 51.5 5.94 14.0 20.1
11 C c 26 20 23 57 25.7 9.90 12.4 20.8
12 C d 19 82 92 78 18.8 40.6 49.5 28.5
Data:
data <- data.frame(group = rep(LETTERS[1:3], each = 4),
Product = letters[1:4],
Jan = sample(1:100,12),
Feb = sample(1:100,12),
Mar = sample(1:100, 12),
May = sample(1:100, 12))
I have the following dataset
out
# A tibble: 1,356 x 7
ID GROUP Gender Age Education tests score
<dbl> <chr> <dbl> <dbl> <dbl> <chr> <dbl>
1 1 TRAINING 1 74 18 ADAS_CogT0 14.7
2 1 TRAINING 1 74 18 ROCF_CT0 32
3 1 TRAINING 1 74 18 ROCF_IT0 3.7
4 1 TRAINING 1 74 18 ROCF_RT0 3.9
5 1 TRAINING 1 74 18 PVF_T0 41.3
6 1 TRAINING 1 74 18 SVF_T0 40
7 1 TRAINING 1 74 18 ADAS_CogT7 16
8 1 TRAINING 1 74 18 ROCF_CT7 33
9 1 TRAINING 1 74 18 ROCF_IT7 1.7
10 1 TRAINING 1 74 18 ROCF_RT7 2.4
I would like to create a column where tests ending with T0 get the value score0, whereas tests ending with T7 get the value score7. What are the possible ways to accomplish this?
Please be so kind as to put the data in your posts, e.g. the output of dput(df).
You could use a combination of case_when and str_detect
library(dplyr)
library(stringr)
df <- structure(
list(
ID = 1:10,
GROUP = rep('TRAINING', 10),
Gender = rep(1, 10),
Education = rep(74, 10),
test = c(
'ADAS_CogT0',
'ROCF_CT0',
'ROCF_IT0',
'ROCF_RT0',
'PVF_T0',
'SVF_T0',
'ADAS_CogT7',
'ROCF_CT7',
'ROCF_IT7',
'ROCF_RT7'
),
score = c(14.7,32,3.7,3.9,41.3,40,16,33,1.7,2.4)
),
row.names = c(1:10),
class = "data.frame"
)
df2 <- df %>%
  mutate(new = case_when(str_detect(test, 'T0') ~ 'score0',
                         str_detect(test, 'T7') ~ 'score7',
                         TRUE ~ test))
ID GROUP Gender Education test score new
1 1 TRAINING 1 74 ADAS_CogT0 14.7 score0
2 2 TRAINING 1 74 ROCF_CT0 32.0 score0
3 3 TRAINING 1 74 ROCF_IT0 3.7 score0
4 4 TRAINING 1 74 ROCF_RT0 3.9 score0
5 5 TRAINING 1 74 PVF_T0 41.3 score0
6 6 TRAINING 1 74 SVF_T0 40.0 score0
7 7 TRAINING 1 74 ADAS_CogT7 16.0 score7
8 8 TRAINING 1 74 ROCF_CT7 33.0 score7
9 9 TRAINING 1 74 ROCF_IT7 1.7 score7
10 10 TRAINING 1 74 ROCF_RT7 2.4 score7
Do you want the output to be the strings 'score0' and 'score7'?
You may try -
library(dplyr)
out %>%
  mutate(result = case_when(grepl('T0$', tests) ~ 'score0',
                            grepl('T7$', tests) ~ 'score7'))
# ID GROUP Gender Age Education tests score result
#1 1 TRAINING 1 74 18 ADAS_CogT0 14.7 score0
#2 1 TRAINING 1 74 18 ROCF_CT0 32.0 score0
#3 1 TRAINING 1 74 18 ROCF_IT0 3.7 score0
#4 1 TRAINING 1 74 18 ROCF_RT0 3.9 score0
#5 1 TRAINING 1 74 18 PVF_T0 41.3 score0
#6 1 TRAINING 1 74 18 SVF_T0 40.0 score0
#7 1 TRAINING 1 74 18 ADAS_CogT7 16.0 score7
#8 1 TRAINING 1 74 18 ROCF_CT7 33.0 score7
#9 1 TRAINING 1 74 18 ROCF_IT7 1.7 score7
#10 1 TRAINING 1 74 18 ROCF_RT7 2.4 score7
Or another option with readr::parse_number.
out %>%
  mutate(result = paste0('score', readr::parse_number(tests)))
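For reference, parse_number() strips the non-numeric characters, so here it returns the trailing 0 or 7 that gets pasted onto 'score':
readr::parse_number(c("ADAS_CogT0", "ROCF_RT7"))
# [1] 0 7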
I have data as follows:
DT <- structure(list(ECOST = c("Choice_01", "Choice_02", "Choice_03",
"Choice_04", "Choice_05", "Choice_06", "Choice_07", "Choice_08",
"Choice_09", "Choice_10", "Choice_11", "Choice_12"), control = c(18,
30, 47, 66, 86, 35, 31, 46, 55, 39, 55, 41), treatment = c(31,
35, 46, 68, 86, 36, 32, 42, 52, 39, 58, 43), control_p = c(0.163636363636364,
0.272727272727273, 0.427272727272727, 0.6, 0.781818181818182,
0.318181818181818, 0.281818181818182, 0.418181818181818, 0.5,
0.354545454545455, 0.5, 0.372727272727273), treatment_p = c(0.319587628865979,
0.360824742268041, 0.474226804123711, 0.701030927835051, 0.88659793814433,
0.371134020618557, 0.329896907216495, 0.43298969072165, 0.536082474226804,
0.402061855670103, 0.597938144329897, 0.443298969072165)), row.names = c(NA,
-12L), class = c("tbl_df", "tbl", "data.frame"))
# A tibble: 12 x 5
ECOST control treatment control_p treatment_p
<chr> <dbl> <dbl> <dbl> <dbl>
1 Choice_01 18 31 0.164 0.320
2 Choice_02 30 35 0.273 0.361
3 Choice_03 47 46 0.427 0.474
4 Choice_04 66 68 0.6 0.701
5 Choice_05 86 86 0.782 0.887
6 Choice_06 35 36 0.318 0.371
7 Choice_07 31 32 0.282 0.330
8 Choice_08 46 42 0.418 0.433
9 Choice_09 55 52 0.5 0.536
10 Choice_10 39 39 0.355 0.402
11 Choice_11 55 58 0.5 0.598
12 Choice_12 41 43 0.373 0.443
I want to melt this data, but I want the columns control and control_p to stay together, and the columns treatment and treatment_p to stay together, creating a table with 24 rows and 4 columns.
Desired result:
# A tibble: 24 x 4
ECOST count percentage group
<chr> <dbl> <dbl> <chr>
1 Choice_01 18 0.164 control
2 Choice_02 30 0.273 control
3 Choice_03 47 0.427 control
4 Choice_04 66 0.6 control
5 Choice_05 86 0.782 control
6 Choice_06 35 0.318 control
7 Choice_07 31 0.282 control
8 Choice_08 46 0.418 control
9 Choice_09 55 0.5 control
10 Choice_10 39 0.355 control
11 Choice_11 55 0.5 control
12 Choice_12 41 0.373 control
13 Choice_01 31 0.320 treatment
14 Choice_02 35 0.361 treatment
15 Choice_03 46 0.474 treatment
16 Choice_04 68 0.701 treatment
17 Choice_05 86 0.887 treatment
18 Choice_06 36 0.371 treatment
19 Choice_07 32 0.330 treatment
20 Choice_08 42 0.433 treatment
21 Choice_09 52 0.536 treatment
22 Choice_10 39 0.402 treatment
23 Choice_11 58 0.598 treatment
24 Choice_12 43 0.443 treatment
Using pivot_longer, some data wrangling, and afterwards pivot_wider, you could achieve your desired result like so:
library(tidyr)
library(dplyr)
DT %>%
  pivot_longer(-ECOST) %>%
  separate(name, into = c("group", "what")) %>%
  mutate(what = ifelse(is.na(what), "count", "percentage")) %>%
  pivot_wider(names_from = "what", values_from = "value")
#> # A tibble: 24 x 4
#> ECOST group count percentage
#> <chr> <chr> <dbl> <dbl>
#> 1 Choice_01 control 18 0.164
#> 2 Choice_01 treatment 31 0.320
#> 3 Choice_02 control 30 0.273
#> 4 Choice_02 treatment 35 0.361
#> 5 Choice_03 control 47 0.427
#> 6 Choice_03 treatment 46 0.474
#> 7 Choice_04 control 66 0.6
#> 8 Choice_04 treatment 68 0.701
#> 9 Choice_05 control 86 0.782
#> 10 Choice_05 treatment 86 0.887
#> # … with 14 more rows
Created on 2021-02-21 by the reprex package (v1.0.0)
You could rename the columns so that you have a clear distinction between count and percentage columns, and then use pivot_longer:
library(dplyr)
library(tidyr)
DT %>%
  rename_with(~ paste(sub('_.*', '', .),
                      rep(c('count', 'percentage'), each = 2), sep = '_'), -1) %>%
  pivot_longer(cols = -ECOST,
               names_to = c('group', '.value'),
               names_sep = '_')
# A tibble: 24 x 4
# ECOST group count percentage
# <chr> <chr> <dbl> <dbl>
# 1 Choice_01 control 18 0.164
# 2 Choice_01 treatment 31 0.320
# 3 Choice_02 control 30 0.273
# 4 Choice_02 treatment 35 0.361
# 5 Choice_03 control 47 0.427
# 6 Choice_03 treatment 46 0.474
# 7 Choice_04 control 66 0.6
# 8 Choice_04 treatment 68 0.701
# 9 Choice_05 control 86 0.782
#10 Choice_05 treatment 86 0.887
# … with 14 more rows
Here is a data.table approach with a workaround for the limitation/feature of melt.data.table()
library( data.table )
setDT(DT)
#get suffixes
suffix <- unique( sub( "(^.*)(_[a-z])", "\\1", names( DT[ , -1] ) ) )
#melt
DT2 <- melt( DT, id.vars = "ECOST", measure.vars = patterns( count = "[a-oq-z]$", percentage = "_p$"))
#replace factor-levels with the colnames
setattr(DT2$variable, "levels", suffix )
ECOST variable count percentage
1: Choice_01 control 18 0.1636364
2: Choice_02 control 30 0.2727273
3: Choice_03 control 47 0.4272727
4: Choice_04 control 66 0.6000000
5: Choice_05 control 86 0.7818182
6: Choice_06 control 35 0.3181818
7: Choice_07 control 31 0.2818182
8: Choice_08 control 46 0.4181818
9: Choice_09 control 55 0.5000000
10: Choice_10 control 39 0.3545455
11: Choice_11 control 55 0.5000000
12: Choice_12 control 41 0.3727273
13: Choice_01 treatment 31 0.3195876
14: Choice_02 treatment 35 0.3608247
15: Choice_03 treatment 46 0.4742268
16: Choice_04 treatment 68 0.7010309
17: Choice_05 treatment 86 0.8865979
18: Choice_06 treatment 36 0.3711340
19: Choice_07 treatment 32 0.3298969
20: Choice_08 treatment 42 0.4329897
21: Choice_09 treatment 52 0.5360825
22: Choice_10 treatment 39 0.4020619
23: Choice_11 treatment 58 0.5979381
24: Choice_12 treatment 43 0.4432990
ECOST variable count percentage
I have two Data Frames. One is an Eye Tracking data frame with subject, condition, timestamp, xposition, and yposition. It has over 400,000 rows. Here's a toy data set for an example:
subid condition time xpos ypos
1 1 1 1.40 195 140
2 1 1 2.50 138 147
3 1 1 3.40 140 162
4 1 1 4.10 188 150
5 1 2 1.10 131 194
6 1 2 2.10 149 111
eyedata <- data.frame(subid = rep(1:2, each = 8),
condition = rep(rep(1:2, each = 4),2),
time = c(1.4, 2.5, 3.4, 4.1,
1.1, 2.1, 3.23, 4.44,
1.33, 2.3, 3.11, 4.1,
.49, 1.99, 3.01, 4.2),
xpos = round(runif(n = 16, min = 100, max = 200)),
ypos = round(runif(n = 16, min = 100, max = 200)))
Then I have a Data Frame with subject, condition, a trial number, and a trial begin and end time. It looks like this:
subid condition trial begin end
1 1 1 1 1.40 2.4
2 1 1 2 2.50 3.2
3 1 1 2 3.21 4.5
4 1 2 1 1.10 1.6
5 1 2 2 2.10 3.3
6 1 2 2 3.40 4.1
7 2 1 1 0.50 1.1
8 2 1 1 1.44 2.9
9 2 1 2 2.97 3.3
10 2 2 1 0.35 1.9
11 2 2 1 2.12 4.5
12 2 2 2 3.20 6.3
trials <- data.frame(subid = rep(1:2, each = 6),
condition = rep(rep(1:2, each = 3),2),
trial= c(rep(c(1,rep(2,2)),2),rep(c(rep(1,2),2),2)),
begin = c(1.4, 2.5, 3.21,
1.10, 2.10, 3.4, .50,
1.44,2.97,.35,2.12,3.20),
end = c(2.4,3.2,4.5,1.6,
3.3,4.1,1.1,2.9,
3.3,1.9,4.5,6.3))
The number of trials in a condition is variable, and I want to add a column to my eye-tracking data frame that specifies the correct trial based on whether the timestamp falls within that trial's time interval. The time intervals do not overlap, but there will be many rows of eye-tracking data in between trials. In the end I'd like a data frame like this:
subid condition trial time xpos ypos
1 1 1 1.40 198 106
1 1 2 2.50 166 139
1 1 2 3.40 162 120
1 1 2 4.10 113 164
1 2 1 1.10 162 120
1 2 2 2.10 162 120
I've seen data.table rolling joins, but would prefer a solution with dplyr or fuzzyjoin. Thanks in advance.
Here's what I tried, but I can't figure out the discrepancies, so it is likely an incomplete answer. Rows 12 and 13 of this result may reflect an overlap in time. Also, when using random generation functions such as runif, please use set.seed() -- here xpos and ypos have no bearing on the result, so it's not an issue.
eyedata %>%
  left_join(trials, by = c("subid", "condition")) %>%
  filter(time >= begin & time <= end)
# subid condition time xpos ypos trial begin end
# 1 1 1 1.40 143 101 1 1.40 2.4
# 2 1 1 2.50 152 173 2 2.50 3.2
# 3 1 1 3.40 185 172 2 3.21 4.5
# 4 1 1 4.10 106 119 2 3.21 4.5
# 5 1 2 1.10 155 165 1 1.10 1.6
# 6 1 2 2.10 169 154 2 2.10 3.3
# 7 1 2 3.23 166 134 2 2.10 3.3
# 8 2 1 2.30 197 171 1 1.44 2.9
# 9 2 1 3.11 140 135 2 2.97 3.3
# 10 2 2 0.49 176 139 1 0.35 1.9
# 11 2 2 3.01 187 180 1 2.12 4.5
# 12 2 2 4.20 147 176 1 2.12 4.5
# 13 2 2 4.20 147 176 2 3.20 6.3
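Since the question mentions fuzzyjoin, here is a sketch of an interval-style match with fuzzyjoin::fuzzy_left_join() (assuming the fuzzyjoin package is installed; the final column clean-up is illustrative):
library(dplyr)
library(fuzzyjoin)

eyedata %>%
  fuzzy_left_join(
    trials,
    by = c("subid" = "subid", "condition" = "condition",
           "time" = "begin", "time" = "end"),
    match_fun = list(`==`, `==`, `>=`, `<=`)
  ) %>%
  filter(!is.na(trial)) %>%   # drop samples that fall between trials
  select(subid = subid.x, condition = condition.x, trial, time, xpos, ypos)
As with the left_join/filter approach above, overlapping trial intervals (such as the ones behind rows 12 and 13) will still produce duplicate matches.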