R: ranking variable per trial according to time column

My data looks like this:
Subject Trial Task Time Fixation
..
1 1 2 1 0.335
1 1 2 456 NA
1 1 2 765 0.165
1 1 2 967 0.445
..
2 3 1 1 0.665
2 3 1 300 0.556
2 3 1 570 NA
2 3 1 900 NA
..
15 5 3 1 0.766
15 5 3 567 0.254
15 5 3 765 0.167
15 5 3 1465 NA
..
I want to create a column FixationID where I want to rank every Fixation per Trial according to Time column (1,2,3,4..). Time column shows time course in milliseconds for every trial and every Trial starts with 1. Trials have different lengths.
I want my data to look like this:
Subject Trial Task Time Fixation FixationID
..
1 1 2 1 0.335 1
1 1 2 456 NA NA
1 1 2 765 0.165 2
1 1 2 967 0.445 3
..
2 3 1 1 0.665 1
2 3 1 300 0.556 2
2 3 1 570 NA NA
2 3 1 900 NA NA
..
15 5 3 1 0.766 1
15 5 3 567 0.254 2
15 5 3 765 0.167 3
15 5 3 1465 NA NA
..
I tried
library(data.table)
setDT(mydata)[!is.na(Fixation), FixID :=
seq_len(.N)[order(Time)], by = Trial]
but what I get is ranking 1,16,31,45,57.. for my Subject 1 Trial 1. I want 1,2,3,4,5..
Can anyone help me with this?
Excerpt from my data:
structure(list(Subject = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L), Trial = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L), Task = c(2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L), Time = c(1L, 385L, 571L, 638L, 951L,
1020L, 1349L, 1401L, 1661L, 1706L, 2042L, 2067L, 2322L, 2375L,
2540L, 2660L, 2686L, 3108L, 3172L, 3423L, 3462L, 3845L, 3870L,
3969L, 4099L, 4132L, 1L, 471L, 513L, 697L), Fixation = c(0.383,
0.185, NA, 0.312, NA, 0.328, NA, 0.259, NA, 0.335, NA, 0.254,
NA, 0.164, 0.119, NA, 0.421, NA, 0.25, NA, 0.382, NA, 0.0979999999999999,
0.129, NA, 0.335, 0.469, NA, 0.183, NA)), .Names = c("Subject",
"Trial", "Task", "Time", "Fixation"), row.names = c(NA, 30L), class = "data.frame")

What about this:
library(data.table)
setDT(mydata)
mydata[!is.na(Fixation), FixID := frank(Time), by = .(Subject, Trial)]
head(mydata, 10)
Subject Trial Task Time Fixation FixID
1: 1 1 2 1 0.383 1
2: 1 1 2 385 0.185 2
3: 1 1 2 571 NA NA
4: 1 1 2 638 0.312 3
5: 1 1 2 951 NA NA
6: 1 1 2 1020 0.328 4
7: 1 1 2 1349 NA NA
8: 1 1 2 1401 0.259 5
9: 1 1 2 1661 NA NA
10: 1 1 2 1706 0.335 6
tail(mydata, 10)
Subject Trial Task Time Fixation FixID
1: 1 1 2 3462 0.382 12
2: 1 1 2 3845 NA NA
3: 1 1 2 3870 0.098 13
4: 1 1 2 3969 0.129 14
5: 1 1 2 4099 NA NA
6: 1 1 2 4132 0.335 15
7: 1 2 2 1 0.469 1
8: 1 2 2 471 NA NA
9: 1 2 2 513 0.183 2
10: 1 2 2 697 NA NA

Using ave on as.logical(Fixation) and @josliber's NA-ignoring cumsum code.
mydata$FixationID <-
with(mydata, ave(as.logical(Fixation), Subject, Trial, FUN=function(x)
cumsum(ifelse(is.na(x), 0, x)) + x*0))
Result
mydata
# Subject Trial task Time Fixation FixationID
# 1 1 1 1 1 0.596 1
# 10 1 1 1 500 0.016 2
# 19 1 1 1 512 NA NA
# 28 1 1 1 524 NA NA
# 4 1 2 2 1 0.688 1
# 13 1 2 2 501 NA NA
# 22 1 2 2 513 NA NA
# 31 1 2 2 525 NA NA
# 7 1 3 3 1 0.582 1
# 16 1 3 3 502 NA NA
# 25 1 3 3 514 0.369 2
# 34 1 3 3 526 0.847 3
# 2 2 1 1 1 NA NA
# 11 2 1 1 503 0.779 1
# 20 2 1 1 515 0.950 2
# 29 2 1 1 527 0.304 3
# 5 2 2 2 1 0.158 1
# 14 2 2 2 504 0.281 2
# 23 2 2 2 516 0.360 3
# 32 2 2 2 528 0.535 4
# 8 2 3 3 1 NA NA
# 17 2 3 3 505 0.717 1
# 26 2 3 3 517 NA NA
# 35 2 3 3 529 0.959 2
# 3 3 1 1 1 0.174 1
# 12 3 1 1 506 0.278 2
# 21 3 1 1 518 0.784 3
# 30 3 1 1 530 NA NA
# 6 3 2 2 1 0.439 1
# 15 3 2 2 507 0.857 2
# 24 3 2 2 519 NA NA
# 33 3 2 2 531 0.019 3
# 9 3 3 3 1 0.175 1
# 18 3 3 3 508 0.314 2
# 27 3 3 3 520 NA NA
# 36 3 3 3 532 0.845 3
Data
mydata <- structure(list(Subject = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), Trial = c(1L, 1L,
1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 2L, 2L,
2L, 2L, 3L, 3L, 3L, 3L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L,
3L, 3L), task = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 1, 1, 1,
1, 2, 2, 2, 2, 3, 3, 3, 3, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3
), Time = c(1, 500, 512, 524, 1, 501, 513, 525, 1, 502, 514,
526, 1, 503, 515, 527, 1, 504, 516, 528, 1, 505, 517, 529, 1,
506, 518, 530, 1, 507, 519, 531, 1, 508, 520, 532), Fixation = c(0.596,
0.016, NA, NA, 0.688, NA, NA, NA, 0.582, NA, 0.369, 0.847, NA,
0.779, 0.95, 0.304, 0.158, 0.281, 0.36, 0.535, NA, 0.717, NA,
0.959, 0.174, 0.278, 0.784, NA, 0.439, 0.857, NA, 0.019, 0.175,
0.314, NA, 0.845)), row.names = c(1L, 10L, 19L, 28L, 4L, 13L,
22L, 31L, 7L, 16L, 25L, 34L, 2L, 11L, 20L, 29L, 5L, 14L, 23L,
32L, 8L, 17L, 26L, 35L, 3L, 12L, 21L, 30L, 6L, 15L, 24L, 33L,
9L, 18L, 27L, 36L), class = "data.frame")

Here is another option which should be fast:
setDT(mydata)[!is.na(Fixation), FixID := .SD[order(Subject, Trial, Time), rowid(Subject, Trial)]]
mydata

Related

Get new columns based on data from other columns

My data:
data <- structure(list(col1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), col2 = c(0L, 1L, 1L, 0L, 0L,
1L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 0L)), class = "data.frame", row.names = c(NA,
-18L))
I want to get 2 new columns based on col1 and col2.
Column 3 (group1) is obtained as follows: keep a 1 where the second column is 0 (otherwise NA); rows where col1 is 2 simply carry the 2 over.
Column 4 (group2) is obtained as follows: keep a 1 where the second column is 1 (otherwise NA); rows where col1 is 2 simply carry the 2 over.
What I want to get:
data <- structure(list(col1 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), col2 = c(0L, 1L, 1L, 0L, 0L,
1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), group1 = c(1L,
NA, NA, 1L, 1L, NA, 1L, NA, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L), group2 = c(NA, 1L, 1L, NA, NA, 1L, NA, 1L, NA, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L)), class = "data.frame", row.names = c(NA,
-18L))
A solution that uses tidyr::pivot_wider():
library(dplyr)
data %>%
mutate(id = 1:n(), name = paste0("group", col2 + 1), value = 1) %>%
tidyr::pivot_wider() %>%
mutate(col2 = replace(col2, col1 == 2, 0),
across(starts_with("group"), replace, col1 == 2, 2)) %>%
select(-id)
# A tibble: 18 x 4
col1 col2 group1 group2
<int> <dbl> <dbl> <dbl>
1 1 0 1 NA
2 1 1 NA 1
3 1 1 NA 1
4 1 0 1 NA
5 1 0 1 NA
6 1 1 NA 1
7 1 0 1 NA
8 1 1 NA 1
9 1 0 1 NA
10 2 0 2 2
11 2 0 2 2
12 2 0 2 2
13 2 0 2 2
14 2 0 2 2
15 2 0 2 2
16 2 0 2 2
17 2 0 2 2
18 2 0 2 2
You can use ifelse to get group1 and group2.
transform(data
, group1 = ifelse(col1==2, 2, ifelse(col2==0, 1, NA))
, group2 = ifelse(col1==2, 2, ifelse(col2==1, 1, NA))
)
# col1 col2 group1 group2
#1 1 0 1 NA
#2 1 1 NA 1
#3 1 1 NA 1
#4 1 0 1 NA
#5 1 0 1 NA
#6 1 1 NA 1
#7 1 0 1 NA
#8 1 1 NA 1
#9 1 0 1 NA
#10 2 0 2 2
#11 2 1 2 2
#12 2 1 2 2
#13 2 0 2 2
#14 2 0 2 2
#15 2 1 2 2
#16 2 0 2 2
#17 2 1 2 2
#18 2 0 2 2

Randomly pick a value from a set of rows and add value to new row below

my R skills are not sufficient to tackle this issue so I'm hoping someone can help.
My data look like this:
head(human.players,25)
Season
Episode
Round
Player
Player_type
Crowd_size
q1_a
q2_a
q3_a
q4_a
q5_a
2020
1
1
1
1
3
0
1
0
0
NA
2020
1
1
2
1
3
0
1
1
1
NA
2020
1
1
3
1
3
0
0
0
1
NA
2020
1
2
1
1
3
1
1
0
1
NA
2020
1
2
2
1
3
1
0
1
0
NA
2020
1
2
3
1
3
1
1
1
0
NA
2020
1
3
1
1
3
0
1
0
0
NA
2020
1
3
2
1
3
0
1
1
1
NA
2020
1
3
3
1
3
0
0
1
1
NA
2020
1
4
1
1
3
0
0
1
1
NA
2020
1
4
2
1
3
0
0
1
1
NA
2020
1
4
3
1
3
0
0
1
1
NA
2020
1
5
1
1
2
1
1
0
0
NA
2020
1
5
2
1
2
1
1
1
0
NA
2020
1
5
3
1
2
NA
NA
NA
NA
NA
2020
1
6
1
1
2
0
0
0
0
NA
2020
1
6
2
1
2
0
0
0
0
NA
2020
1
6
3
1
2
NA
NA
NA
NA
NA
2020
1
7
1
1
2
0
1
1
1
NA
2020
1
7
2
1
2
1
0
0
1
NA
2020
1
7
3
1
2
NA
NA
NA
NA
NA
2020
2
1
1
1
3
1
1
0
0
NA
2020
2
1
2
1
3
0
0
0
1
NA
2020
2
1
3
1
3
0
1
1
0
NA
Vars from q1_a:q5_a represent whether a player got the question wrong (0) or correct (1). Each player plays in a specific round (there are 7 rounds per episode). In the first 4 rounds, there are 3 players. However, in rounds 5-7, there are only 2 players (the one player that is eliminated has NA's - e.g., in episode 1, this is player 3 - see table above).
I need to create a random player. This means that for the first 4 rounds I need to randomly select an answer (for each of the 5 questions) from the three players within that round and add the "random player" row value. For rounds 5 to 7 I need to select an answer from the two players (ignoring the NA) and add the "random player" row value.
An algorithm of sorts would have to look at round 1 (only those rows), sample one value from the three rows, paste it into round 1 (i.e., create row 4 in this example ) and do that for each of the 5 questions. Then for round 2...
This is how it is supposed to look like where I've added player 4 - the random player:
Season
Episode
Round
Player
Player_type
Crowd_size
q1_a
q2_a
q3_a
q4_a
q5_a
2020
1
1
1
1
3
0
1
0
0
NA
2020
1
1
2
1
3
0
1
1
1
NA
2020
1
1
3
1
3
0
0
0
1
NA
2020
1
1
4
1
3
0
0
1
1
NA
2020
1
2
1
1
3
1
1
0
1
NA
2020
1
2
2
1
3
1
0
1
0
NA
2020
1
2
3
1
3
1
1
1
0
NA
2020
1
2
4
1
3
1
1
1
0
NA
2020
1
3
1
1
3
0
1
0
0
NA
2020
1
3
2
1
3
0
1
1
1
NA
2020
1
3
3
1
3
0
0
1
1
NA
2020
1
3
4
1
3
0
0
0
1
NA
2020
1
4
1
1
3
0
0
1
1
NA
2020
1
4
2
1
3
0
0
1
1
NA
2020
1
4
3
1
3
0
0
1
1
NA
2020
1
4
4
1
3
0
0
1
1
NA
Writing this, I'm thinking it may be impossible or at least very difficult to do this so this question is more like a "hail mary". I presume some combination of sample(), apply(), and creating a custom function is necessary, but I'm stumped.
Here's one pipe that samples the new players and their scores into a separate frame, which you can then bind_rows back into the original data.
set.seed(2021)
newplayers <- dat %>%
filter(!is.na(q1_a)) %>%
group_by(Season, Episode, Round) %>%
summarize(across(everything(), ~ sample(., size=1)), .groups = "drop") %>%
mutate(Player = NA_integer_, Player_type = NA_integer_)
newplayers
# # A tibble: 8 x 11
# Season Episode Round Player Player_type Crowd_size q1_a q2_a q3_a q4_a q5_a
# <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <lgl>
# 1 2020 1 1 NA NA 3 0 0 1 1 NA
# 2 2020 1 2 NA NA 3 1 1 0 0 NA
# 3 2020 1 3 NA NA 3 0 1 1 0 NA
# 4 2020 1 4 NA NA 3 0 0 1 1 NA
# 5 2020 1 5 NA NA 2 1 1 1 0 NA
# 6 2020 1 6 NA NA 2 0 0 0 0 NA
# 7 2020 1 7 NA NA 2 0 0 1 1 NA
# 8 2020 2 1 NA NA 3 0 1 0 0 NA
bind_rows(dat, newplayers) %>%
arrange(Season, Episode, Round, is.na(Player), Player) %>%
head(.)
# Season Episode Round Player Player_type Crowd_size q1_a q2_a q3_a q4_a q5_a
# 1 2020 1 1 1 1 3 0 1 0 0 NA
# 2 2020 1 1 2 1 3 0 1 1 1 NA
# 3 2020 1 1 3 1 3 0 0 0 1 NA
# 4 2020 1 1 NA NA 3 0 0 1 1 NA
# 5 2020 1 2 1 1 3 1 1 0 1 NA
# 6 2020 1 2 2 1 3 1 0 1 0 NA
I didn't know what values to put into Player*, so I chose NA.
Data
# dput(dat)
dat <- structure(list(Season = c(2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L, 2020L), Episode = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L), Round = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 3L, 4L, 4L, 4L, 5L, 5L, 5L, 6L, 6L, 6L, 7L, 7L, 7L, 1L, 1L, 1L), Player = c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), Player_type = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), Crowd_size = c(3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L), q1_a = c(0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, NA, 0L, 0L, NA, 0L, 1L, NA, 1L, 0L, 0L), q2_a = c(1L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, NA, 0L, 0L, NA, 1L, 0L, NA, 1L, 0L, 1L), q3_a = c(0L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, NA, 0L, 0L, NA, 1L, 0L, NA, 0L, 0L, 1L), q4_a = c(0L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, NA, 0L, 0L, NA, 1L, 1L, NA, 0L, 1L, 0L), q5_a = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), class = "data.frame", row.names = c(NA, -24L))

R - fill value based on conditions in Person Period Format

I am struggling to find a simple way to fill values based on two simple conditions.
I am trying to fill the variable working with 1 after the first and last "1" for each dayweek. The example is more telling.
id hours dayweek working
1 1 1 Friday 0
2 1 2 Friday 0
3 1 3 Friday 0
4 1 4 Friday 0
5 1 5 Friday 0
6 1 6 Friday 0
7 1 7 Friday 0
8 1 8 Friday 1
9 1 9 Friday 0
10 1 10 Friday 0
11 1 11 Friday 0
12 1 12 Friday 0
13 1 13 Friday 0
14 1 14 Friday 0
15 1 15 Friday 0
16 1 16 Friday 0
17 1 17 Friday 1
18 1 18 Friday 0
19 1 19 Friday 0
20 1 20 Friday 0
I am trying to do this.
id hours dayweek working
1 1 1 Friday 0
2 1 2 Friday 0
3 1 3 Friday 0
4 1 4 Friday 0
5 1 5 Friday 0
6 1 6 Friday 0
7 1 7 Friday 0
8 1 8 Friday 1
9 1 9 Friday 1
10 1 10 Friday 1
11 1 11 Friday 1
12 1 12 Friday 1
13 1 13 Friday 1
14 1 14 Friday 1
15 1 15 Friday 1
16 1 16 Friday 1
17 1 17 Friday 1
18 1 18 Friday 0
19 1 19 Friday 0
20 1 20 Friday 0
The group_by must be id and dayweek.
Any clue?
The data
structure(list(id = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = c("1",
"2", "3"), class = "factor"), hours = 1:20, dayweek = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), .Label = c("Friday", "Monday", "Saturday", "Sunday",
"Thursday", "Tuesday", "Wedesnday"), class = "factor"), working = c(0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0)), row.names = c(NA,
20L), class = "data.frame", .Names = c("id", "hours", "dayweek",
"working"))
alternative data of same problem
dt = structure(list(X = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 29L, 30L,
31L, 32L, 33L, 34L, 35L, 36L, 57L, 58L, 59L, 60L, 61L, 62L, 63L,
64L), id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 3L, 3L), hours = c(1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L), dayweek = structure(c(1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L), .Label = c("Friday", "Monday", "Saturday",
"Sunday", "Thursday", "Tuesday", "Wedesnday"), class = "factor"),
working = c(0L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 0L,
0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), class = "data.frame", row.names = c(NA,
-24L), .Names = c("X", "id", "hours", "dayweek", "working"))
We could use data.table to do this. We convert the 'data.frame' to 'data.table' (setDT(df1)). Grouped by 'id' and 'dayweek', we get the numeric index of the elements in 'working' that are equal to 1 ('tmp'), but only if there is at least one 1 value in the group (if(any(working==1))). We then take the sequence (:) between the first (head(tmp,1)) and last (tail(tmp,1)) positions and wrap it with .I to get the row indices ('i1'). Finally, we use that index to assign 1 to the corresponding 'working' elements.
library(data.table)
i1 <- setDT(df1)[, if(any(working==1)){tmp <- which(working==1)
.I[head(tmp,1):tail(tmp,1)]} , by = .(id, dayweek)]$V1
df1[i1, working:=1L]
df1
# id hours dayweek working
# 1: 1 1 Friday 0
# 2: 1 2 Friday 0
# 3: 1 3 Friday 0
# 4: 1 4 Friday 0
# 5: 1 5 Friday 0
# 6: 1 6 Friday 0
# 7: 1 7 Friday 0
# 8: 1 8 Friday 1
# 9: 1 9 Friday 1
#10: 1 10 Friday 1
#11: 1 11 Friday 1
#12: 1 12 Friday 1
#13: 1 13 Friday 1
#14: 1 14 Friday 1
#15: 1 15 Friday 1
#16: 1 16 Friday 1
#17: 1 17 Friday 1
#18: 1 18 Friday 0
#19: 1 19 Friday 0
#20: 1 20 Friday 0
Or a similar solution using dplyr (as suggested by @David Arenburg) would be to group by the 'id' and 'dayweek' columns, use min and max to get the first and last positions where working == 1, and replace those elements in working with 1. If there is no 1 value for a particular group, we can wrap with ifelse to return 0 for that group.
library(dplyr)
df1 %>%
group_by(id, dayweek) %>%
mutate(new = any(working ==1),
working = ifelse(new, replace(working,
min(which(working == 1)):max(which(working == 1)), 1),
as.numeric(new))) %>%
select(-new)
#Source: local data frame [20 x 4]
#Groups: id, dayweek
#
# id hours dayweek working
#1 1 1 Friday 0
#2 1 2 Friday 0
#3 1 3 Friday 0
#4 1 4 Friday 0
#5 1 5 Friday 0
#6 1 6 Friday 0
#7 1 7 Friday 0
#8 1 8 Friday 1
#9 1 9 Friday 1
#10 1 10 Friday 1
#11 1 11 Friday 1
#12 1 12 Friday 1
#13 1 13 Friday 1
#14 1 14 Friday 1
#15 1 15 Friday 1
#16 1 16 Friday 1
#17 1 17 Friday 1
#18 1 18 Friday 0
#19 1 19 Friday 0
#20 1 20 Friday 0
Or a compact option suggested by #Khashaa where we multiply the cummax of 'working' with cummax of reverse (rev) of 'working' so that only elements that are 1 in both the vectors remain as 1 while others will be replaced by 0.
df1 %>%
group_by(id, dayweek) %>%
mutate(working = cummax(working)*rev(cummax(rev(working))))

count frequency based on values in 2 or more columns

I have a pretty simple question but I can't think of a way to do this without using if statements
The data I have looks something like:
df <- structure(list(years = c(1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L,
1L, 2L, 3L, 1L, 2L, 3L, 1L, 2L, 3L), id = c(1L, 1L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L), x = structure(c(2L,
1L, 1L, 1L, 2L, 2L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 2L,
1L), .Label = c("E", "I"), class = "factor")), .Names = c("years",
"id", "x"), class = "data.frame", row.names = c(NA, -18L))
so the table looks like:
years id x
1 1 1 I
2 2 1 E
3 3 1 E
4 1 1 E
5 2 1 I
6 3 1 I
7 1 2 I
8 2 2 E
9 3 2 I
10 1 2 E
11 2 2 E
12 3 2 I
13 1 3 I
14 2 3 E
15 3 3 I
16 1 3 I
17 2 3 I
18 3 3 E
I would like the output to report the fraction of x's that are "I" for each id and each year:
years id xnew
1 1 1 0.5
2 2 1 0.5
3 3 1 0.5
4 1 2 0.5
5 2 2 0.0
6 3 2 1.0
7 1 3 1.0
8 2 3 0.5
9 3 3 0.5
Any help would be greatly appreciated! Thank you!
aggregate(x ~ years + id, data=df, function(y) sum(y=="I")/length(y) )
years id x
1 1 1 0.5
2 2 1 0.5
3 3 1 0.5
4 1 2 0.5
5 2 2 0.0
6 3 2 1.0
7 1 3 1.0
8 2 3 0.5
9 3 3 0.5

Setting incomparables in place with merge

I'm seeing some unexpected behaviour with merge (or at least not entirely intuitive). But perhaps I'm just not understanding how it's supposed to work:
Let's create some dummy data to play with first:
x <- structure(list(A = c(2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L), B = c(2L, 2L, 1L, 2L,
1L, 1L, 1L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 1L, 1L
), C = c(2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 2L,
2L, 1L, 1L, 1L, 1L, 2L, 2L), D = c(2L, 1L, 2L, 2L, 2L, 1L, 1L,
2L, 1L, 2L, 1L, 2L, 1L, 1L, 2L, 1L, 2L, 1L, 1L, 1L), E = c(2L,
1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L,
1L, 1L, 1L), F = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L,
2L, 1L, 1L, 1L, 1L, 1L, 2L, 1L, 1L), G = c(2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 2L, 2L),
H = c(1L, 2L, 1L, 2L, 1L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L,
1L, 2L, 1L, 2L, 1L, 1L, 1L), I = c(1L, 1L, 2L, 2L, 2L, 1L,
1L, 1L, 2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 2L, 1L),
J = c(2L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 2L, 1L, 2L, 1L, 2L,
2L, 2L, 2L, 2L, 1L, 2L, 1L), K = c(3, 3, 1, 3, 1, 3, 1, 2,
2, 2, 1, 3, 2, 2, 2, 1, NA, 1, 2, 1)), .Names = c("A", "B",
"C", "D", "E", "F", "G", "H", "I", "J", "K"), row.names = c(NA,
20L), class = "data.frame")
# Generate Listing of All Possible Combinations
y <- list(1:2); y = expand.grid(rep(y,10));
colnames(y) <- LETTERS[1:10]
y <- rbind(y,y,y)
y$K <- rep(1:3,each=1024)
y$mergekey <- sample(1:6,3072,replace=TRUE)
My expectation is that when I merge these two data sets that setting sort=FALSE and all.x=TRUE would provide me with a list of all x in place with mergekey.
Let's try that:
merge(x,y,all.x=TRUE,sort=FALSE)
A B C D E F G H I J K mergekey
1 2 2 2 2 2 1 2 1 1 2 3 5
2 2 2 1 1 1 1 2 2 1 1 3 3
3 2 1 2 2 1 1 2 1 2 2 1 3
4 2 2 1 2 2 1 2 2 2 2 3 2
5 1 1 2 2 2 2 2 1 2 2 1 4
6 2 1 1 1 2 2 2 2 1 2 3 6
7 1 1 1 1 2 2 2 2 1 2 1 5
8 2 1 2 2 1 1 2 2 1 1 2 4
9 2 2 2 1 1 1 2 1 2 2 2 4
10 2 1 2 2 1 1 2 1 1 1 2 2
11 2 1 2 1 1 1 2 1 2 2 1 4
12 2 2 1 2 1 2 2 1 2 1 3 5
13 2 1 2 1 1 1 2 1 2 2 2 3
14 2 1 2 1 1 1 2 1 2 2 2 3
15 2 2 2 1 2 1 2 1 2 2 2 1
16 2 1 1 2 1 1 2 2 2 2 2 1
17 2 1 1 1 1 1 2 1 1 2 1 2
18 1 2 1 1 1 2 2 1 1 1 1 5
19 2 1 2 1 1 1 2 1 1 1 1 4
20 2 2 1 2 1 1 1 2 1 2 NA NA
Now it seems that "most of x is unsorted" but incomparables are pushed to the end, rather than maintaining their order.
So, my question is: How do I get the incomparables to stay in place?
PS: Does it not seem a little unintuitive to push incomparables to the end if the merge has been told not to sort? I don't find this congruent with this behaviour either
The join function in the plyr package solves this problem intuitively without additional arguements.
library(plyr)
join(x,y)
Joining by: A, B, C, D, E, F, G, H, I, J, K
A B C D E F G H I J K mergekey
1 2 2 2 2 2 1 2 1 1 2 3 4
2 2 2 1 1 1 1 2 2 1 1 3 3
3 2 1 2 2 1 1 2 1 2 2 1 5
4 2 2 1 2 2 1 2 2 2 2 3 3
5 1 1 2 2 2 2 2 1 2 2 1 6
6 2 1 1 1 2 2 2 2 1 2 3 6
7 1 1 1 1 2 2 2 2 1 2 1 4
8 2 1 2 2 1 1 2 2 1 1 2 2
9 2 2 2 1 1 1 2 1 2 2 2 4
10 2 1 2 2 1 1 2 1 1 1 2 6
11 2 1 2 1 1 1 2 1 2 2 1 1
12 2 2 1 2 1 2 2 1 2 1 3 3
13 2 1 2 1 1 1 2 1 2 2 2 2
14 2 2 2 1 2 1 2 1 2 2 2 6
15 2 1 1 2 1 1 2 2 2 2 2 2
16 2 1 1 1 1 1 2 1 1 2 1 3
17 2 2 1 2 1 1 1 2 1 2 NA NA
18 1 2 1 1 1 2 2 1 1 1 1 1
19 2 1 2 1 1 1 2 1 2 2 2 2
20 2 1 2 1 1 1 2 1 1 1 1 1

Resources