How to find min and max in dplyr? - r

I know the sum of points for each person.
I need to know the minimum number of points that each person could have, and the maximum number of points that each person could have.
What I have tried:
min_and_max <- dataset %>%
group_by(person) %>%
dplyr::filter(min(sum(points, na.rm = T))) %>%
distinct(person) %>%
pull()
min_and_max
My dataset:
id person points
201 rt99 NA
201 rt99 3
201 rt99 2
202 kt 4
202 kt NA
202 kt NA
203 rr 4
203 rr NA
203 rr NA
204 jk 2
204 jk 2
204 jk NA
322 knm3 5
322 knm3 NA
322 knm3 3
343 kll2 2
343 kll2 1
343 kll2 5
344 kll NA
344 kll 7
344 kll 1

I would suggest this dplyr approach. You have to summarize data like this:
library(tidyverse)
#Code
# Total, smallest and largest score per id/person pair; missing scores (NA)
# are ignored. TRUE is spelled out because T is an ordinary variable that
# can be reassigned.
df %>%
  group_by(id, person) %>%
  summarise(
    Total = sum(points, na.rm = TRUE),
    min = min(points, na.rm = TRUE),
    max = max(points, na.rm = TRUE)
  )
Output:
# A tibble: 7 x 5
# Groups: id [7]
id person Total min max
<int> <chr> <int> <int> <int>
1 201 rt99 5 2 3
2 202 kt 4 4 4
3 203 rr 4 4 4
4 204 jk 4 2 2
5 322 knm3 8 3 5
6 343 kll2 8 1 5
7 344 kll 8 1 7

Here is the data.table solution -
# Add each person's minimum and maximum score as new columns, by reference.
# Both columns are computed in one grouped pass; NA scores are ignored.
# NOTE(review): assumes `dataset` is already a data.table (e.g. after
# setDT(dataset)) - confirm before running.
dataset[, `:=`(
  min_points = min(points, na.rm = TRUE),
  max_points = max(points, na.rm = TRUE)
), by = person]
Since I don't have your data, I cannot test this code, but it should work fine.

The summarize() verb is what you want for this. You don't even need to filter out the NA values first since both min() and max() can have na.rm = TRUE.
library(dplyr)
# Smallest and largest non-missing score for every person.
min_and_max <- summarize(
  group_by(dataset, person),
  min = min(points, na.rm = TRUE),
  max = max(points, na.rm = TRUE)
)
min_and_max
# A tibble: 7 x 3
person min max
<chr> <dbl> <dbl>
1 jk 2 2
2 kll 1 7
3 kll2 1 5
4 knm3 3 5
5 kt 4 4
6 rr 4 4
7 rt99 2 3
dput(dataset)
structure(list(id = c(201, 201, 201, 202, 202, 202, 203, 203,
203, 204, 204, 204, 322, 322, 322, 343, 343, 343, 344, 344, 344
), person = c("rt99", "rt99", "rt99", "kt", "kt", "kt", "rr",
"rr", "rr", "jk", "jk", "jk", "knm3", "knm3", "knm3", "kll2",
"kll2", "kll2", "kll", "kll", "kll"), points = c(NA, 3, 2, 4,
NA, NA, 4, NA, NA, 2, 2, NA, 5, NA, 3, 2, 1, 5, NA, 7, 1)), class = "data.frame", row.names = c(NA,
-21L), spec = structure(list(cols = list(id = structure(list(), class = c("collector_double",
"collector")), person = structure(list(), class = c("collector_character",
"collector")), points = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))

Related

replacing rowwise() operations in grouped data

Anonymised example subset of a much larger dataset (now edited to show an option with multiple competing types):
structure(list(`Sample File` = c("A", "A", "A", "A", "A", "A",
"A", "A", "A", "B", "B", "B", "B", "B", "C", "C", "C", "C"),
Marker = c("X", "X", "X", "X", "Y", "Y", "Y", "Y", "Y", "Z",
"Z", "Z", "Z", "Z", "q", "q", "q", "q"), Allele = c(19, 20,
22, 23, 18, 18.2, 19, 19.2, 20, 12, 13, 14, 15, 16, 10, 10.2,
11, 12), Size = c(249.15, 253.13, 260.64, 264.68, 366, 367.81,
369.97, 372.02, 373.95, 91.65, 95.86, 100, 104.24, 108.38,
177.51, 179.4, 181.42, 185.49), Height = c(173L, 1976L, 145L,
1078L, 137L, 62L, 1381L, 45L, 1005L, 38L, 482L, 5766L, 4893L,
19L, 287L, 36L, 5001L, 50L), Type = c("minusone", "allele",
"minusone", "allele", "ambiguous", "minushalf", "allele",
"minushalf", "allele", "minustwo", "ambiguous", "allele",
"allele", "plusone", "minusone", "minushalf", "allele", "plusone"
), LUS = c(11.75, 11.286, 13.375, 13.5, 18, 9, 19, 10, 20,
12, 11, 14, 15, 16, 9.5, NA, 11, 11.5)), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -18L), groups = structure(list(
`Sample File` = c("A", "A", "B", "C"), Marker = c("X", "Y",
"Z", "q"), .rows = structure(list(1:4, 5:9, 10:14, 15:18), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -4L), .drop = TRUE))
I want to look up values based on the classification $Type.
"minustwo" means I want to look up the "Allele", "Height" and "LUS"
values for the row with "Allele" equal to the current row plus two,
with the same Sample File and Marker.
"minusone" means the same but for "Allele" equal to the current row plus one.
"minushalf" means the same but for "Allele" equal to the current row plus 0.2, except that the dot values here are 25% each (so 12.1, 12.2, 12.3, 13, 13.1 etc.) - I have a helper function plusTwoBP() for this.
"plusone" means the same for "Allele" equal to the current row -1
"allele" or "ambiguous" don't need to do anything.
Ideal output:
# A tibble: 18 × 10
# Rowwise: Sample File, Marker
`Sample File` Marker Allele Size Height Type LUS ParentHeight ParentAllele ParentLUS
<chr> <chr> <dbl> <dbl> <int> <chr> <dbl> <int> <dbl> <dbl>
1 A X 19 249. 173 minusone 11.8 1976 20 11.3
2 A X 20 253. 1976 allele 11.3 NA NA NA
3 A X 22 261. 145 minusone 13.4 1078 23 13.5
4 A X 23 265. 1078 allele 13.5 NA NA NA
5 A Y 18 366 137 ambiguous 18 NA NA NA
6 A Y 18.2 368. 62 minushalf 9 1381 19 19
7 A Y 19 370. 1381 allele 19 NA NA NA
8 A Y 19.2 372. 45 minushalf 10 1005 20 20
9 A Y 20 374. 1005 allele 20 NA NA NA
10 B Z 12 91.6 38 minustwo 12 5766 14 14
11 B Z 13 95.9 482 ambiguous 11 NA NA NA
12 B Z 14 100 5766 allele 14 NA NA NA
13 B Z 15 104. 4893 allele 15 NA NA NA
14 B Z 16 108. 19 plusone 16 4893 15 15
15 C q 10 178. 287 minusone 9.5 5001 11 11
16 C q 10.2 179. 36 minushalf NA 5001 11 11
17 C q 11 181. 5001 allele 11 NA NA NA
18 C q 12 185. 50 plusone 11.5 5001 11 11
I have a rather belaboured way of doing it:
# eg for minustwo
sampleData %>%
filter(Type == "minustwo") %>%
rowwise() %>%
mutate(ParentHeight = sampleData$Height[sampleData$`Sample File` == `Sample File` & sampleData$Marker == Marker & sampleData$Allele == (Allele + 2)],
ParentAllele = sampleData$Allele[sampleData$`Sample File` == `Sample File` & sampleData$Marker == Marker & sampleData$Allele == (Allele + 2)],
ParentLUS = sampleData$LUS[sampleData$`Sample File` == `Sample File` & sampleData$Marker == Marker & sampleData$Allele == (Allele + 2)]) %>%
right_join(sampleData)
I then have to redo that for each of my Types
My real dataset is thousands of rows so this ends up being a little slow but manageable, but more to the point I want to learn a better way to do it, in particular the sampleData$'Sample File' == 'Sample File' & sampleData$Marker == Marker seems like it should be doable with grouping so I must be missing a trick there.
I have tried using group_map() but I've clearly not understood it correctly:
sampleData$ParentHeight <- sampleData %>%
group_by(`Sample File`, `Marker`) %>%
group_map(.f = \(.x, .y) {
pmap_dbl(.l = .x, .f = \(Allele, Height, Type, ...){
if(Type == "allele" | Type == "ambiguous") { return(0)
} else if (Type == "plusone") {
return(.x$Height[.x$Allele == round(Allele - 1, 1)])
} else if (Type == "minushalf") {
return(.x$Height[.x$Allele == round(plustwoBP(Allele), 1)])
} else if (Type == "minusone") {
return(.x$Height[.x$Allele == round(Allele + 1, 1)])
} else if (Type == "minustwo") {
return(.x$Height[.x$Allele == round(Allele + 2, 1)])
} else { stop("unexpected peak type") }
})}) %>% unlist()
Initially seems to work, but on investigation it's not respecting both layers of grouping, so brings matches from the wrong Marker. Additionally, here I'm assigning the output to a new column in the data frame, but if I try to instead wrap a mutate() around this so that I can create all three new columns in one go then the group_map() no longer works at all.
I also considered using complete() to hugely extend the data frame with all possible values of Allele (including x.0, x.1, x.2, x.3 variants), then using lag() to select the corresponding rows, then dropping the spare rows. This seems like it'd make the data frame enormous in the interim.
To summarise
This works, but it feels ugly and like I'm missing a more elegant and obvious solution. How would you approach this?
You can create two versions of Allele: one identical to the original Allele, and one that is equal to an adjustment based on minusone, minustwo, etc
Then do a self left join, based on that adjusted version of Allele (and Sample File and Marker)
# Drop the grouping carried by the input and keep a copy of Allele as the
# join key `id` (a plain column copy needs no grouping).
sampleData <- sampleData %>%
  ungroup() %>%
  mutate(id = Allele)

# Self-join. On the left side `id` is shifted to the Allele of the expected
# parent peak; on the right side `id` is still the row's own Allele.
# Types without a rule ("allele", "ambiguous") get id = NA from case_when(),
# so they find no parent as long as no Allele is NA.
left_join(
  sampleData %>%
    mutate(id = case_when(
      Type == "minusone" ~ id + 1,
      Type == "minustwo" ~ id + 2,
      Type == "plusone" ~ id - 1,
      # ceiling() sends a fractional Allele such as 18.2 to the next whole one
      Type == "minushalf" ~ ceiling(id)
    )),
  sampleData %>% select(-c(Size, Type)),
  by = c("Sample File", "Marker", "id"),
  suffix = c("", ".parent")
) %>%
  select(-id)
Output:
# A tibble: 18 × 10
`Sample File` Marker Allele Size Height Type LUS Allele.parent Height.parent LUS.parent
<chr> <chr> <dbl> <dbl> <int> <chr> <dbl> <dbl> <int> <dbl>
1 A X 19 249. 173 minusone 11.8 20 1976 11.3
2 A X 20 253. 1976 allele 11.3 NA NA NA
3 A X 22 261. 145 minusone 13.4 23 1078 13.5
4 A X 23 265. 1078 allele 13.5 NA NA NA
5 A Y 18 366 137 ambiguous 18 NA NA NA
6 A Y 18.2 368. 62 minushalf 9 19 1381 19
7 A Y 19 370. 1381 allele 19 NA NA NA
8 A Y 19.2 372. 45 minushalf 10 20 1005 20
9 A Y 20 374. 1005 allele 20 NA NA NA
10 B Z 12 91.6 38 minustwo 12 14 5766 14
11 B Z 13 95.9 482 ambiguous 11 NA NA NA
12 B Z 14 100 5766 allele 14 NA NA NA
13 B Z 15 104. 4893 allele 15 NA NA NA
14 B Z 16 108. 19 plusone 16 15 4893 15
15 C q 10 178. 287 minusone 9.5 11 5001 11
16 C q 10.2 179. 36 minushalf NA 11 5001 11
17 C q 11 181. 5001 allele 11 NA NA NA
18 C q 12 185. 50 plusone 11.5 11 5001 11

How to set missing some columns and their corresponding columns in data frame in R

I have longitudinal data with three follow-ups, recorded in columns 2, 3 and 4 (v_9, v_01 and v_03).
I want to set the value 99 in the columns v_9, v_01, and v_03 to NA, but I want to set their corresponding columns (columns "d_9", "d_01","d_03" and "a_9", "a_01","a_03") as NA as well. As an example for ID 101 as below:
How can I do this for all the individuals and my whole data set in R? thanks in advance for the help.
"id" "v_9" "v_01" "v_03" "d_9" "d_01" "d_03" "a_9" "a_01" "a_03"
101 12 NA 10 2015-03-23 NA 2003-06-19 40.50650 NA 44.1065
structure(list(id = c(101, 102, 103, 104), v_9 = c(12, 99, 16,
25), v_01 = c(99, 12, 16, NA), v_03 = c(10, NA, 99, NA), d_9 = structure(c(16517,
17613, 16769, 10667), class = "Date"), d_01 = structure(c(13291,
NA, 13566, NA), class = "Date"), d_03 = structure(c(12222, NA,
12119, NA), class = "Date"), a_9 = c(40.5065, 40.5065, 30.19713,
51.40862), a_01 = c(42.5065, 41.5112, 32.42847, NA), a_03 = c(44.1065,
NA, 35.46543, NA)), row.names = c(NA, -4L), class = c("tbl_df",
"tbl", "data.frame"))
Try this function:
#' Blank out a sentinel value together with its companion columns
#'
#' For every follow-up suffix, rows whose value column
#' (`<value_prefix><suffix>`) equals `missing_code` get `NA` in that column
#' and in the companion columns (`<linked_prefix><suffix>`) of the same
#' follow-up. Other rows and columns are left untouched.
#'
#' @param df A data frame with one column per prefix/suffix combination.
#' @param suffixes Character vector of follow-up suffixes to process.
#' @param missing_code Value(s) in the value column that mean "missing".
#' @param value_prefix Prefix of the column checked for `missing_code`.
#' @param linked_prefixes Prefixes of the columns blanked along with it.
#' @return `df` with the affected cells replaced by `NA`.
fn <- function(df,
               suffixes = c("_9", "_01", "_03"),
               missing_code = 99,
               value_prefix = "v",
               linked_prefixes = c("d", "a")) {
  prefixes <- c(value_prefix, linked_prefixes)
  for (s in suffixes) {
    cols <- paste0(prefixes, s)
    absent <- setdiff(cols, names(df))
    if (length(absent) > 0) {
      stop("columns not found: ", paste(absent, collapse = ", "),
           call. = FALSE)
    }
    # %in% never yields NA, so cells that are already NA are simply skipped.
    hit <- df[[cols[1]]] %in% missing_code
    if (!any(hit)) {
      next
    }
    # Assign column by column so each column keeps its own type (e.g. Date).
    for (col in cols) {
      df[hit, col] <- NA
    }
  }
  df
}
df <- fn(df)
Output
# A tibble: 4 × 10
id v_9 v_01 v_03 d_9 d_01 d_03 a_9 a_01 a_03
<dbl> <dbl> <dbl> <dbl> <date> <date> <date> <dbl> <dbl> <dbl>
1 101 12 NA 10 2015-03-23 NA 2003-06-19 40.5 NA 44.1
2 102 NA 12 NA NA NA NA NA 41.5 NA
3 103 16 16 NA 2015-11-30 2007-02-22 NA 30.2 32.4 NA
4 104 25 NA NA 1999-03-17 NA NA 51.4 NA NA

Merging / Joining Data While Keeping All Data from Both Data Frames (even if no matches)

I have two data frames that I am trying to combine into one master data frame by ID and Date. My issue is that the data frames have some similar and some unique dates. One data frame goes sequentially through the dates but may be missing a day or two at the beginning and/or end, while the other data frame has multiple samples from the same ID and is only sampled every 3 days.
DF1 example:
Nest.ID Date X Y Nest.ID.Date
AMRO_1_ 5/2/20 7 2 AMRO_1_5-02-20
AMRO_1_ 5/3/20 1 5 AMRO_1_5-03-20
AMRO_1_ 5/4/20 7 9 AMRO_1_5-04-20
AMRO_1_ 5/5/20 3 2 AMRO_1_5-05-20
AMRO_1_ 5/6/20 1 3 AMRO_1_5-06-20
DF2 Example
Nest.ID Indiv.ID Date U V Nest.ID.Date
AMRO_1_ A 5/1/20 468 294 AMRO_1_5-01-20
AMRO_1_ B 5/1/20 454 456 AMRO_1_5-01-20
AMRO_1_ C 5/1/20 436 245 AMRO_1_5-01-20
AMRO_1_ A 5/4/20 356 762 AMRO_1_5-04-20
AMRO_1_ B 5/4/20 345 953 AMRO_1_5-04-20
AMRO_1_ C 5/4/20 356 345 AMRO_1_5-04-20
AMRO_1_ A 5/7/20 763 193 AMRO_1_5-07-20
AMRO_1_ B 5/7/20 763 186 AMRO_1_5-07-20
AMRO_1_ C 5/7/20 235 762 AMRO_1_5-07-20
Wanted Outcome:
Nest.ID Date X Y Indiv.ID U V
AMRO_1_ 5/1/20 NA NA A 468 294
AMRO_1_ 5/1/20 NA NA B 454 456
AMRO_1_ 5/1/20 NA NA C 436 245
AMRO_1_ 5/2/20 7 2 NA NA NA
AMRO_1_ 5/3/20 1 5 NA NA NA
AMRO_1_ 5/4/20 7 9 A 356 762
AMRO_1_ 5/4/20 7 9 B 345 953
AMRO_1_ 5/4/20 7 9 C 356 345
AMRO_1_ 5/5/20 3 2 NA NA NA
AMRO_1_ 5/6/20 1 3 NA NA NA
AMRO_1_ 5/7/20 NA NA A 763 193
AMRO_1_ 5/7/20 NA NA B 763 186
AMRO_1_ 5/7/20 NA NA C 235 762
Using a full_join with keep= TRUE I can get very close and get a final DF with all the data from DF1 and with NAs where DF2 wasn't sampled, but I cannot figure out how to also include the beginning/end dates that are not in DF1 but in DF2 (so 5/1 and 5/7 in the examples). I suspect this may be due to my "key" of ID.Date, but those are the only variables I can use to keep things in order when merging (in other words, since I have multiple samples per ID, I can't just use ID as my "key").
You want to do a full_join() from dplyr.
Here are the data in friendlier form:
DF1 <- structure(list(Nest.ID = c("AMRO_1_", "AMRO_1_", "AMRO_1_", "AMRO_1_",
"AMRO_1_"), Date = c("5/2/20", "5/3/20", "5/4/20", "5/5/20",
"5/6/20"), X = c(7, 1, 7, 3, 1), Y = c(2, 5, 9, 2, 3), Nest.ID.Date = c("AMRO_1_5-02-20",
"AMRO_1_5-03-20", "AMRO_1_5-04-20", "AMRO_1_5-05-20", "AMRO_1_5-06-20"
)), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"), row.names = c(NA,
-5L), spec = structure(list(cols = list(Nest.ID = structure(list(), class = c("collector_character",
"collector")), Date = structure(list(), class = c("collector_character",
"collector")), X = structure(list(), class = c("collector_double",
"collector")), Y = structure(list(), class = c("collector_double",
"collector")), Nest.ID.Date = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))
DF2 <- structure(list(Nest.ID = c("AMRO_1_", "AMRO_1_", "AMRO_1_", "AMRO_1_",
"AMRO_1_", "AMRO_1_", "AMRO_1_", "AMRO_1_", "AMRO_1_"), Indiv.ID = c("A",
"B", "C", "A", "B", "C", "A", "B", "C"), Date = c("5/1/20", "5/1/20",
"5/1/20", "5/4/20", "5/4/20", "5/4/20", "5/7/20", "5/7/20", "5/7/20"
), U = c(468, 454, 436, 356, 345, 356, 763, 763, 235), V = c(294,
456, 245, 762, 953, 345, 193, 186, 762), Nest.ID.Date = c("AMRO_1_5-01-20",
"AMRO_1_5-01-20", "AMRO_1_5-01-20", "AMRO_1_5-04-20", "AMRO_1_5-04-20",
"AMRO_1_5-04-20", "AMRO_1_5-07-20", "AMRO_1_5-07-20", "AMRO_1_5-07-20"
)), class = c("spec_tbl_df", "tbl_df", "tbl", "data.frame"), row.names = c(NA,
-9L), spec = structure(list(cols = list(Nest.ID = structure(list(), class = c("collector_character",
"collector")), Indiv.ID = structure(list(), class = c("collector_character",
"collector")), Date = structure(list(), class = c("collector_character",
"collector")), U = structure(list(), class = c("collector_double",
"collector")), V = structure(list(), class = c("collector_double",
"collector")), Nest.ID.Date = structure(list(), class = c("collector_character",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))
Here is the code.
library(dplyr)
library(lubridate)
# full_join() keeps every row from both tables; with no `by` argument it
# joins on all columns the two tables have in common.
DF1 %>% full_join(DF2) %>%
# drop the combined key column from the result
select(-Nest.ID.Date) %>%
# parse the month/day/year strings into Date values before sorting on them
mutate(Date = mdy(Date)) %>%
arrange(Date)
Joining, by = c("Nest.ID", "Date", "Nest.ID.Date")
# A tibble: 13 x 7
Nest.ID Date X Y Indiv.ID U V
<chr> <date> <dbl> <dbl> <chr> <dbl> <dbl>
1 AMRO_1_ 2020-05-01 NA NA A 468 294
2 AMRO_1_ 2020-05-01 NA NA B 454 456
3 AMRO_1_ 2020-05-01 NA NA C 436 245
4 AMRO_1_ 2020-05-02 7 2 NA NA NA
5 AMRO_1_ 2020-05-03 1 5 NA NA NA
6 AMRO_1_ 2020-05-04 7 9 A 356 762
7 AMRO_1_ 2020-05-04 7 9 B 345 953
8 AMRO_1_ 2020-05-04 7 9 C 356 345
9 AMRO_1_ 2020-05-05 3 2 NA NA NA
10 AMRO_1_ 2020-05-06 1 3 NA NA NA
11 AMRO_1_ 2020-05-07 NA NA A 763 193
12 AMRO_1_ 2020-05-07 NA NA B 763 186
13 AMRO_1_ 2020-05-07 NA NA C 235 762

Splitting one column into two columns using data wrangling with R

I would really appreciate your help in using R for data wrangling. I have a data where I want to split one column (variable) into two whenever applicable as conditioned by other variables. For example, as per the sample below, the data represents reactions time measures (RT1 and RT2) of some words (item) that appear in different times of reading (block). I want to see if RT1 and RT2 values in block 3, 4, and 5 are correlated with RT1 and RT2 values of the same item at block 1. The target items that appeared in block 1 and re-appeared in subsequent blocks are coded as 'EI' in the column 'condition', whereas items coded as 'E' or 'I' appeared only once.
dput(d1)
structure(list(RECORDING_SESSION_LABEL = c(26, 26, 26, 26, 26,
26, 26, 26), RT1 = c(5171, 3857, 3447, 314, 460, 731, 957, 1253
), RT2 = c(357, 328, 122, 39, 86, 132, 173, 215), item = c("foreign",
"detailed", "large", "foreign", "foreign", "large", "large",
"disputable"), block = c(1, 1, 1, 3, 4, 3, 4, 3), condition = c("EI",
"E", "EI", "EI", "EI", "EI", "EI", "I")), row.names = c(NA, -8L
), class = c("tbl_df", "tbl", "data.frame"))
Where a sample of the data would look like this:
> d1
# A tibble: 8 x 6
RECORDING_SESSION_LABEL RT1 RT2 item block condition
<dbl> <dbl> <dbl> <chr> <dbl> <chr>
1 26 5171 357 foreign 1 EI
2 26 3857 328 detailed 1 E
3 26 3447 122 large 1 EI
4 26 314 39 foreign 3 EI
5 26 460 86 foreign 4 EI
6 26 731 132 large 3 EI
7 26 957 173 large 4 EI
8 26 1253 215 disputable 3 I
In order to present in a format that R would understand, the target data frame I want to achieve would be similar to the one below (where the highlighted columns should be added). Rows in blanks at these columns represent items which do not appear repetitively (condition is not coded as 'EI') ; therefore, they are irrelevant and should be coded as 'NA'.
dput(d2)
structure(list(RECORDING_SESSION_LABEL = c(26, 26, 26, 26, 26,
26, 26, 26), `RT 1` = c(5171, 3857, 3447, 314, 460, 731, 957,
1253), RT2 = c(357, 328, 122, 39, 86, 132, 173, 215), item = c("foreign",
"detailed", "large", "foreign", "foreign", "large", "large",
"disputable"), block = c(1, 1, 1, 3, 4, 3, 4, 3), condition = c("EI",
"E", "EI", "EI", "EI", "EI", "EI", "I"), `RT 1_at_block1` = c(NA,
NA, NA, 5171, 5171, 3447, 3447, NA), RT2_at_block1 = c(NA, NA,
NA, 357, 357, 122, 122, NA)), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"))
And a sample of the data format targeted would look like this:
> d2
# A tibble: 8 x 8
RECORDING_SESSI~ `RT 1` RT2 item block condition `RT 1_at_block1`
<dbl> <dbl> <dbl> <chr> <dbl> <chr> <dbl>
1 26 5171 357 fore~ 1 EI NA
2 26 3857 328 deta~ 1 E NA
3 26 3447 122 large 1 EI NA
4 26 314 39 fore~ 3 EI 5171
5 26 460 86 fore~ 4 EI 5171
6 26 731 132 large 3 EI 3447
7 26 957 173 large 4 EI 3447
8 26 1253 215 disp~ 3 I NA
# ... with 1 more variable: RT2_at_block1 <dbl>
> head(d2)
# A tibble: 6 x 8
RECORDING_SESSION_LABEL `RT 1` RT2 item block condition `RT 1_at_block1` RT2_at_block1
<dbl> <dbl> <dbl> <chr> <dbl> <chr> <dbl> <dbl>
1 26 5171 357 foreign 1 EI NA NA
2 26 3857 328 detailed 1 E NA NA
3 26 3447 122 large 1 EI NA NA
4 26 314 39 foreign 3 EI 5171 357
5 26 460 86 foreign 4 EI 5171 357
6 26 731 132 large 3 EI 3447 122
Thanks in advance for any help.
A possible solution using dplyr:
d1 <- structure(list(RECORDING_SESSION_LABEL = c(26, 26, 26, 26, 26, 26, 26, 26),
RT1 = c(5171, 3857, 3447, 314, 460, 731, 957, 1253),
RT2 = c(357, 328, 122, 39, 86, 132, 173, 215),
item = c("foreign", "detailed", "large", "foreign", "foreign", "large", "large", "disputable"),
block = c(1, 1, 1, 3, 4, 3, 4, 3), condition = c("EI", "E", "EI", "EI", "EI", "EI", "EI", "I")),
row.names = c(NA, -8L), class = c("tbl_df", "tbl", "data.frame"))
library(dplyr)
# Attach each row's block-1 reaction times (matched on session and item).
# Both lookup columns come from a single filtered copy of d1, so one join
# with an explicit key is enough.
d2 <- d1 %>%
  left_join(
    d1 %>%
      filter(block == 1) %>%
      select(RECORDING_SESSION_LABEL, item,
             RT1_at_block1 = RT1, RT2_at_block1 = RT2),
    by = c("RECORDING_SESSION_LABEL", "item")
  )
After that, d2 looks like this:
RECORDING_SESSION_LABEL RT1 RT2 item block condition RT1_at_block1 RT2_at_block1
<dbl> <dbl> <dbl> <chr> <dbl> <chr> <dbl> <dbl>
1 26 5171 357 foreign 1 EI 5171 357
2 26 3857 328 detailed 1 E 3857 328
3 26 3447 122 large 1 EI 3447 122
4 26 314 39 foreign 3 EI 5171 357
5 26 460 86 foreign 4 EI 5171 357
6 26 731 132 large 3 EI 3447 122
Edit: Adding a mutate if you want to set the values for block 1 to NA:
# Attach each row's block-1 reaction times (matched on session and item),
# then blank them on the block-1 rows themselves so only later blocks
# carry a reference value.
d2 <- d1 %>%
  left_join(
    d1 %>%
      filter(block == 1) %>%
      select(RECORDING_SESSION_LABEL, item,
             RT1_at_block1 = RT1, RT2_at_block1 = RT2),
    by = c("RECORDING_SESSION_LABEL", "item")
  ) %>%
  mutate(
    RT1_at_block1 = ifelse(block == 1, NA, RT1_at_block1),
    RT2_at_block1 = ifelse(block == 1, NA, RT2_at_block1)
  )

R lapply update list of data.tables with list - no such index at level 1

I am trying to update a list of data.tables with a list, which seems like it should work, as it does in this example:
set.seed(1965)
dt_lst <- list(dt1 <- data.table(a = rnorm(1:4),
b = c(4,3,2,1)), dt2 <- data.table(c = rnorm(1:5),
d = letters[1:5]))
> dt_lst
[[1]]
a b
1: 0.8428429 4
2: 0.2958355 3
3: -1.0520980 2
4: 0.9628192 1
[[2]]
c d
1: -0.05033855 a
2: -0.94065157 b
3: 1.20459624 c
4: -0.47791557 d
5: -0.30362496 e
Now a list for the update (someone said dt1 was group 1 and dt2 was group2 and group needed to be in the results):
group1 <- list(1,2)
And lapply update:
dt_lst_tst <- lapply(seq_along(dt_lst),
function(x)
dt_lst[[x]][, group:= group1[[x]]])
> dt_lst_tst
[[1]]
a b group
1: 0.8428429 4 1
2: 0.2958355 3 1
3: -1.0520980 2 1
4: 0.9628192 1 1
[[2]]
c d group
1: -0.05033855 a 2
2: -0.94065157 b 2
3: 1.20459624 c 2
4: -0.47791557 d 2
5: -0.30362496 e 2
Perfect, and characteristic of my data where I never know how big a data.table I'll have (nrows) nor which 'group' it is supposed to be in
until after the fact, hence updating.
So now, with a very small amount of my data:
> dput(combine_sub1)
list(structure(list(smp = 1:4, x = c(491, 491, 491, 491), y = c(798,
798, 798, 798)), .Names = c("smp", "x", "y"), class = c("data.table",
"data.frame"), row.names = c(NA, -4L), .internal.selfref = <pointer:
0x2b859d8>),
structure(list(smp = 1:6, x = c(650, 650, 650, 650, 650,
650), y = c(437, 437, 437, 437, 437, 437)), .Names = c("smp",
"x", "y"), class = c("data.table", "data.frame"), row.names = c(NA,
-6L), .internal.selfref = <pointer: 0x2b859d8>), structure(list(
smp = 1:5, x = c(480, 485, 540, 572, 589), y = c(462,
462, 455, 451, 450)), .Names = c("smp", "x", "y"), class =
c("data.table",
"data.frame"), row.names = c(NA, -5L), .internal.selfref = <pointer:
0x2b859d8>))
> combine_sub1
[[1]]
smp x y
1: 1 491 798
2: 2 491 798
3: 3 491 798
4: 4 491 798
[[2]]
smp x y
1: 1 650 437
2: 2 650 437
3: 3 650 437
4: 4 650 437
5: 5 650 437
6: 6 650 437
[[3]]
smp x y
1: 1 480 462
2: 2 485 462
3: 3 540 455
4: 4 572 451
5: 5 589 450
group3_lst <- list(1,2,3)
> group3_lst
[[1]]
[1] 1
[[2]]
[1] 2
[[3]]
[1] 3
And using similar lapply as above:
> combine_sub1_tst <- lapply(seq_along(combine_sub1),
+ function(x)
+ combine_sub1[[x]][ , group := group3_lst[[x]]])
Error in group3_lst[[x]] : no such index at level 1
And I can't figure out why the difference. Any help appreciated.
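The problem is caused by using x as the function argument: it clashes with the x column of the data.tables in combine_sub1. Inside [.data.table the j expression is evaluated with the table's columns in scope, so the x in group3_lst[[x]] resolves to the x column (e.g. c(491, 491, 491, 491)) rather than to the lapply index, hence "no such index at level 1". Use a different variable name that is not a column in those data.tables and it will work fine, e.g. use i: combine_sub1_tst <- lapply(seq_along(combine_sub1), function(i) combine_sub1[[i]][ , group := group3_lst[[i]]])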

Resources