Ranges surrounding values in a data frame (R dplyr)

I have a data frame that looks something like this:
test <- data.frame(chunk = c(rep("a",27),rep("b",27)), x = c(1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1))
There is a column by which I would like to group the data using group_by() in dplyr, which in this example is called chunk.
I want to add another column to each chunk of test, called x1, so the resulting data frame looks like this:
test1 <- data.frame(test, x1 = c(0,0,0,0,0,0,0,1,1,1,1,1,2,2,2,2,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,2,2,2,2,1,1,1,1,1,0,0,0,0,0,0))
x1 marks all of the occurrences of 0 in x, plus a range of ±5 rows on either side of the run of 0s, with an identifier. What the identifier is doesn't matter; in this example x1 is 2 for the 0s themselves and 1 for the surrounding ±5-row range.
Thanks for any and all help!

Here's an option to do it in dplyr:
Shorter version:
n <- 1:5
test %>%
  group_by(chunk) %>%
  mutate(x1 = ifelse((row_number() - min(which(x == 0))) %in% -n |
                       (row_number(chunk) - max(which(x == 0))) %in% n,
                     1, ifelse(x == 0, 2, 0)))
Longer (first) version:
test %>%
  group_by(chunk) %>%
  mutate(start = (row_number() - min(which(x == 0))) %in% -5:-1,
         end = (row_number() - max(which(x == 0))) %in% 1:5,
         x1 = ifelse(start | end, 1, ifelse(x == 0, 2, 0))) %>%
  select(-c(start, end))
Source: local data frame [54 x 3]
Groups: chunk
chunk x x1
1 a 1 0
2 a 1 0
3 a 1 0
4 a 1 0
5 a 1 0
6 a 1 0
7 a 1 0
8 a 1 1
9 a 1 1
10 a 1 1
11 a 1 1
12 a 1 1
13 a 0 2
14 a 0 2
15 a 0 2
16 a 0 2
17 a 1 1
18 a 1 1
19 a 1 1
20 a 1 1
21 a 1 1
22 a 1 0
23 a 1 0
24 a 1 0
25 a 1 0
26 a 1 0
27 a 1 0
28 b 1 0
29 b 1 0
30 b 1 0
31 b 1 0
32 b 1 0
33 b 1 0
34 b 1 0
35 b 1 1
36 b 1 1
37 b 1 1
38 b 1 1
39 b 1 1
40 b 0 2
41 b 0 2
42 b 0 2
43 b 0 2
44 b 1 1
45 b 1 1
46 b 1 1
47 b 1 1
48 b 1 1
49 b 1 0
50 b 1 0
51 b 1 0
52 b 1 0
53 b 1 0
54 b 1 0
The assumption in this approach is that each group of "chunk" contains only one sequence of 0s (as in the sample data). Let me know if that's not the case in your actual data.
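If your real data can contain several separate runs of 0s within a chunk, one option is to compute each row's distance to the nearest 0 and flag rows within 5 of one. This is only a sketch (not part of the answer above), but it reproduces x1 on the sample data:
test %>%
  group_by(chunk) %>%
  mutate(dist = sapply(row_number(), function(i) {
           zeros <- which(x == 0)                                # positions of 0s within this chunk
           if (length(zeros) == 0) Inf else min(abs(i - zeros))  # distance to the nearest 0 (Inf if none)
         }),
         x1 = ifelse(x == 0, 2, ifelse(dist <= 5, 1, 0))) %>%
  select(-dist) %>%
  ungroup()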

Related

Create an index variable for blocks of values

I have a dataframe "data" with a grouping variable "grp" and a binary classification variable "classif". For each group in grp, I want to create a "result" variable that indexes the separate blocks of 0s in classif. So far I don't know how to reset the count for each level of the grouping variable, and I can't find a way to build the index only for the blocks of 0s (ignoring the 1s).
Example data:
grp <- c(1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2,2,2,2,2,3,3,3,3,3,3,3,3,3)
classif <- c(0,1,0,0,1,0,0,1,1,0,0,0,0,1,0,1,1,1,0,0,1,1,0,0,0,1,0,1,0)
result <- c(1,0,2,2,0,3,3,0,0,1,1,1,1,0,2,0,0,0,3,3,0,0,1,1,1,0,2,0,3)
wrong_result <- c(1,2,3,3,4,5,5,1,1,2,2,2,2,3,4,5,5,5,6,6,1,1,2,2,2,3,4,5,6)
Data <- data.frame(grp,classif,result, wrong_result)
I have tried using rleid but the following command produces "wrong_result", which is not what I'm after.
data[, wrong_result:= rleid(classif)]
data[, wrong_result:= rleid(classif), by=grp]
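For reference, the rleid() attempt can be fixed by renumbering only the zero runs within each group; a data.table sketch (assuming the table is called Data and has been converted with setDT()):
library(data.table)
setDT(Data)
Data[, result2 := {
  r <- rleid(classif)                     # run id for every run (0s and 1s alike)
  m <- match(r, unique(r[classif == 0]))  # renumber only the zero runs 1, 2, 3, ...
  fifelse(is.na(m), 0L, m)                # runs of 1s get 0
}, by = grp]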
With dplyr, use cumsum() and lag() to find blocks of zeroes per group via .by (make sure you're on dplyr >= 1.1.0, which introduced the .by argument).
library(dplyr)
Data %>%
  mutate(
    result2 = ifelse(
      classif == 0,
      cumsum(classif == 0 & lag(classif, default = 1) == 1),
      0
    ),
    .by = grp
  )
grp classif result result2
1 1 0 1 1
2 1 1 0 0
3 1 0 2 2
4 1 0 2 2
5 1 1 0 0
6 1 0 3 3
7 1 0 3 3
8 2 1 0 0
9 2 1 0 0
10 2 0 1 1
11 2 0 1 1
12 2 0 1 1
13 2 0 1 1
14 2 1 0 0
15 2 0 2 2
16 2 1 0 0
17 2 1 0 0
18 2 1 0 0
19 2 0 3 3
20 2 0 3 3
21 3 1 0 0
22 3 1 0 0
23 3 0 1 1
24 3 0 1 1
25 3 0 1 1
26 3 1 0 0
27 3 0 2 2
28 3 1 0 0
29 3 0 3 3
Use rle and sequentially number the runs produced and then convert back and zero out the runs of 1's. No packages are used.
seq0 <- function(x) {
  r <- rle(x)                          # run-length encode the 0/1 vector
  is0 <- r$values == 0                 # identify the runs of 0s
  r$values[is0] <- seq_len(sum(is0))   # number the zero runs 1, 2, 3, ...
  inverse.rle(r) * !x                  # expand back and zero out the runs of 1s
}
transform(Data, result2 = ave(classif, grp, FUN = seq0))
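A quick check of seq0() on its own (a usage sketch on a small vector, matching the start of grp 1):
seq0(c(0, 1, 0, 0, 1, 0, 0))
# [1] 1 0 2 2 0 3 3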

Creating a new column with conditions in addition to the row value of the new column

Any ideas on how to create a new column B using the values of column A, while also using the value of the row above in the newly created column B?
The value of B should correspond to:
A == 0: the value of the row above.
A == 1: 1.
A == 2: the value of the row above + 1.
Current dataframe + desired outcome
A (dataframe)   A  B (desired outcome)
1 1 1
0 0 1
2 2 2
0 0 2
2 2 3
0 0 3
2 2 4
0 0 4
2 2 5
0 0 5
2 2 6
0 0 6
1 1 1
0 0 1
1 1 1
0 0 1
2 2 2
0 0 2
2 2 3
0 0 3
1 1 1
0 0 1
2 2 2
0 0 2
Data Frame
A <- c(1,0,2,0,2,0,2,0,2,0,2,0,1,0,1,0,2,0,2,0,1,0,2,0)
Bdesiredoutcome <- c(1,1,2,2,3,3,4,4,5,5,6,6,1,1,1,1,2,2,3,3,1,1,2,2)
df = data.frame(A,Bdesiredoutcome)
I tried using dplyr with mutate(), case_when() and lag(), but keep running into errors due to the lag() function; when using lag(A) the desired outcome cannot be generated.
Any ideas on how to solve this problem?
df <- df %>%
  mutate(B = case_when((A == 0) ~ lag(B),
                       (A == 1) ~ 1,
                       (A == 2) ~ (lag(B) + 1)
  ))
Error in UseMethod("mutate_") :
no applicable method for 'mutate_' applied to an object of class "function"
In addition: Warning message:
We can create a grouping column with cumsum() and then create the 'B' column:
library(dplyr)
df %>%
group_by(grp = cumsum(A == 1)) %>%
mutate(B = cumsum(A != 0)) %>%
ungroup %>%
select(-grp) %>%
as.data.frame
-output
A Bdesired B
1 1 1 1
2 0 1 1
3 2 2 2
4 0 2 2
5 2 3 3
6 0 3 3
7 2 4 4
8 0 4 4
9 2 5 5
10 0 5 5
11 2 6 6
12 0 6 6
13 1 1 1
14 0 1 1
15 1 1 1
16 0 1 1
17 2 2 2
18 0 2 2
19 2 3 3
20 0 3 3
21 1 1 1
22 0 1 1
23 2 2 2
24 0 2 2
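For comparison, the same idea also works in base R with ave(), using cumsum(A == 1) as the grouping vector; a sketch on the df defined above:
# cumulative count of non-zero A within blocks that start at each A == 1
df$B <- with(df, ave(as.numeric(A != 0), cumsum(A == 1), FUN = cumsum))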
On your original question I got the following:
library(tidyverse)
library(lubridate)
df$date <- dmy(df$date)
df <- df %>%
  arrange(id, date) %>%
  group_by(id) %>%
  mutate(daysbetween = replace_na(date - lag(date), 0),
         ind = 1,
         NewA = case_when(daysbetween < 7 ~ 0, daysbetween > 7 ~ 1),
         NewB = case_when(daysbetween < 85 ~ 0, daysbetween > 85 ~ 1),
         A = case_when(1 + cumsum(ind*NewA) <= 6 ~ 1 + cumsum(ind*NewA),
                       1 + cumsum(ind*NewA) > 6 ~ 1 + cumsum(ind*NewA) - 6),
         B = 1 + cumsum(ind*NewB)) %>%
  select(id, date, A, B)
It only works if the reset for A is at 6. I used cumsum() as suggested above.
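If the counter for A needs to wrap past 6 more than once (the case_when() above only handles a single wrap), a modulo form may be more robust; a sketch replacing the A = case_when(...) step:
A = (cumsum(ind * NewA) %% 6) + 1   # cycles through 1..6 repeatedly instead of only once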

Recoding by an order in r

I have a data recoding puzzle. Here is what my sample data looks like:
df <- data.frame(
id = c(1,1,1,1,1,1,1, 2,2,2,2,2,2, 3,3,3,3,3,3,3),
scores = c(0,1,1,0,0,-1,-1, 0,0,1,-1,-1,-1, 0,1,0,1,1,0,1),
position = c(1,2,3,4,5,6,7, 1,2,3,4,5,6, 1,2,3,4,5,6,7),
cat = c(1,1,1,1,1,0,0, 1,1,1,0,0,0, 1,1,1,1,1,1,1))
id scores position cat
1 1 0 1 1
2 1 1 2 1
3 1 1 3 1
4 1 0 4 1
5 1 0 5 1
6 1 -1 6 0
7 1 -1 7 0
8 2 0 1 1
9 2 0 2 1
10 2 1 3 1
11 2 -1 4 0
12 2 -1 5 0
13 2 -1 6 0
14 3 0 1 1
15 3 1 2 1
16 3 0 3 1
17 3 1 4 1
18 3 1 5 1
19 3 0 6 1
20 3 1 7 1
There are three ids in the dataset and the rows are ordered by a position variable. For each id, the first row where scores becomes -1 needs its scores set to 0 and its cat set to 1. For example, for id = 1 that is the row at position 6; in that row, scores should be 0 and cat should be 1. For ids that do not have scores = -1, I keep them as they are.
The desired output should look like below:
id scores position cat
1 1 0 1 1
2 1 1 2 1
3 1 1 3 1
4 1 0 4 1
5 1 0 5 1
6 1 0 6 1
7 1 -1 7 0
8 2 0 1 1
9 2 0 2 1
10 2 1 3 1
11 2 0 4 1
12 2 -1 5 0
13 2 -1 6 0
14 3 0 1 1
15 3 1 2 1
16 3 0 3 1
17 3 1 4 1
18 3 1 5 1
19 3 0 6 1
20 3 1 7 1
Any recommendations??
Thanks
This may be what you are after
df %>%
  group_by(id) %>%
  mutate(i = which(scores == -1)[1]) %>%                                   # find the first row where scores == -1
  mutate(scores = case_when(position == i & scores != 0 ~ 0, T ~ scores),  # update the score using position & i
         cat = ifelse(scores == -1, 0, 1)) %>%                             # then update cat
  select(-i)                                                               # remove the helper column i
After trying a few things and getting ideas from @Ricky and @e.matt, I came up with a solution.
df %>%
  filter(scores == -1) %>%                                       # keep rows where scores == -1
  distinct(id, .keep_all = T) %>%                                # keep the first such row per id
  mutate(first = 1) %>%                                          # flag these rows
  right_join(df, by = c("id", "scores", "position", "cat")) %>%  # join back onto the original dataset
  mutate(first = coalesce(first, 0)) %>%                         # replace NAs with 0
  mutate(scores = case_when(first == 1 ~ 0,
                            TRUE ~ scores)) %>%
  mutate(cat = case_when(first == 1 ~ 1,
                         TRUE ~ cat))
This provides my desired output.
id scores position cat first
1 1 0 1 1 0
2 1 1 2 1 0
3 1 1 3 1 0
4 1 0 4 1 0
5 1 0 5 1 0
6 1 0 6 1 1
7 1 -1 7 0 0
8 2 0 1 1 0
9 2 0 2 1 0
10 2 1 3 1 0
11 2 0 4 1 1
12 2 -1 5 0 0
13 2 -1 6 0 0
14 3 0 1 1 0
15 3 1 2 1 0
16 3 0 3 1 0
17 3 1 4 1 0
18 3 1 5 1 0
19 3 0 6 1 0
20 3 1 7 1 0
Here is a data.table one-liner:
library( data.table )
setDT(df)
df[ df[, .(cumsum( scores == -1 ) == 1), by = .(id)]$V1, `:=`( scores = 0, cat = 1) ]
# id scores position cat
# 1: 1 0 1 1
# 2: 1 1 2 1
# 3: 1 1 3 1
# 4: 1 0 4 1
# 5: 1 0 5 1
# 6: 1 0 6 1
# 7: 1 -1 7 0
# 8: 2 0 1 1
# 9: 2 0 2 1
# 10: 2 1 3 1
# 11: 2 0 4 1
# 12: 2 -1 5 0
# 13: 2 -1 6 0
# 14: 3 0 1 1
# 15: 3 1 2 1
# 16: 3 0 3 1
# 17: 3 1 4 1
# 18: 3 1 5 1
# 19: 3 0 6 1
# 20: 3 1 7 1
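The one-liner can also be unpacked into two steps if that reads more easily (a sketch; the flag is TRUE where the running count of -1 scores within an id is exactly 1, which in this data is just the first -1 row of each id):
library(data.table)
setDT(df)
flag <- df[, cumsum(scores == -1) == 1, by = id]$V1  # per-id flag for the first -1
df[flag, `:=`(scores = 0, cat = 1)]                  # recode those rows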
You could do something along these lines using the dplyr package:
library(dplyr)
df = mutate(df, cat = ifelse(scores == -1, 1, cat),
scores = ifelse(scores == -1, 0, scores))
Using the mutate() function, I am re-assigning the values for the scores and cat fields according to ifelse() conditional statements. For scores, if the score is -1, the value is replaced by 0, otherwise it keeps the score as is. For cat, it also checks if scores is equal to -1, but would assign a value of 1 when the condition is met, or the already existing value of cat when the condition is not met.
EDIT
After our discussion in the comments, I think something along these lines should be helpful (you may have to modify the logic since I don't exactly follow what the desired output is here):
for(i in seq_len(nrow(df) - 1)){
  # Check if the score in this row is -1
  if(df[i, 'scores'] == -1){
    # Update values for the next row
    df[i+1, 'scores'] <- 0
    df[i+1, 'cat'] <- 1
  }
}
Sorry that I don't really follow the desired output, hopefully this is helpful in getting you to your answer!

Generate a new variable based on value changes in another variable (R)

I asked something very similar before, but I have a better understanding of my problem now. I will try my best to ask it as clearly as I can.
I have a sample dataset looks like this below:
id <- c(1,1,1, 2,2,2, 3,3, 4,4, 5,5,5,5, 6,6,6, 7, 8,8, 9,9, 10,10)
item.id <- c(1,1,2, 1,1,1 ,1,1, 1,2, 1,2,2,2, 1,1,1, 1, 1,2, 1,1, 1,1)
sequence <- c(1,2,1, 1,2,3, 1,2, 1,1, 1,1,2,3, 1,2,3, 1, 1,1, 1,2, 1,2)
score <- c(0,0,0, 0,0,1, 2,0, 1,1, 1,0,1,1, 0,0,0, 1, 0,2, 1,2, 2,1)
data <- data.frame("id"=id, "item.id"=item.id, "sequence"=sequence, "score"=score)
> data
id item.id sequence score
1 1 1 1 0
2 1 1 2 0
3 1 2 1 0
4 2 1 1 0
5 2 1 2 0
6 2 1 3 1
7 3 1 1 2
8 3 1 2 0
9 4 1 1 1
10 4 2 1 1
11 5 1 1 1
12 5 2 1 0
13 5 2 2 1
14 5 2 3 1
15 6 1 1 0
16 6 1 2 0
17 6 1 3 0
18 7 1 1 1
19 8 1 1 0
20 8 2 1 2
21 9 1 1 1
22 9 1 2 2
23 10 1 1 2
24 10 1 2 1
id identifies each student, item.id identifies the questions students take, sequence is the attempt number for each item.id, and score is the score for each attempt, taking the values 0, 1, or 2. Students can change their answers.
For each item.id within each id, I create a variable (status) by looking at the last two attempts (the change between them). The recoding rules for status are:
1- If there is only one attempt for a question:
a) assign "BTW" (Blank to Wrong) if the item score is 0.
b) assign "BTR" (Blank to Right) if the item score is 1.
2- If there are multiple attempts for a question:
a) assign "BTW" (Blank to Wrong) if the first attempt's score is 0.
b) assign "BTR" (Blank to Right) if the first attempt's score is 1.
c) assign "WW" for those who changed from wrong to wrong (0 to 0),
d) assign "WR" for those who changed to an increasing score (0 to 1, or 1 to 2),
e) assign "RW" for those who changed to a decreasing score (2 to 1, 2 to 0, or 1 to 0), and
f) assign "RR" for those who changed from right to right (1 to 1, 2 to 2).
A score change from 0 to 1, 0 to 2, or 1 to 2 is considered a correct (right) change, while a change from 1 to 0, 2 to 0, or 2 to 1 is considered an incorrect (wrong) change.
If there is only one attempt for an item.id, as for id = 7, the status should be "BTR" (or "BTW" if the score was 0). The logic is supposed to be: if the score increases, the status should be WR; if it decreases, RW.
With the code below, however, changes
a) from 1 to 2 should be WR but were coded as RR, and
b) from 2 to 1 should be RW but were coded as WW.
Things also did not work out for some other cases, for example id = 1, whose status should be {BTW, WW}.
library(dplyr)
data %>%
  group_by(id, item.id) %>%
  mutate(diff = c(0, diff(score)),
         status = case_when(
           n() == 1 & score == 0 ~ "BTW",
           n() == 1 & score == 1 ~ "BTR",
           diff == 0 & score == 0 ~ "WW",
           diff == 0 & score > 0 ~ "RR",
           diff > 0 ~ "WR",
           diff < 0 ~ "RW",
           TRUE ~ "oops"))
> data
id item.id sequence score diff status
1 1 1 1 0 0 WW
2 1 1 2 0 0 WW
3 1 2 1 0 0 BTW
4 2 1 1 0 0 WW
5 2 1 2 0 0 WW
6 2 1 3 1 1 WR
7 3 1 1 2 0 RR
8 3 1 2 0 -2 RW
9 4 1 1 1 0 BTR
10 4 2 1 1 0 BTR
11 5 1 1 1 0 BTR
12 5 2 1 0 0 WW
13 5 2 2 1 1 WR
14 5 2 3 1 0 RR
15 6 1 1 0 0 WW
16 6 1 2 0 0 WW
17 6 1 3 0 0 WW
18 7 1 1 1 0 BTR
19 8 1 1 0 0 BTW
20 8 2 1 2 0 RR
21 9 1 1 1 0 RR
22 9 1 2 2 1 WR
23 10 1 1 2 0 RR
24 10 1 2 1 -1 RW
The desired output would be:
> desired
id item.id sequence score status
1 1 1 1 0 BTW
2 1 1 2 0 WW
3 1 2 1 0 BTW
4 2 1 1 0 BTW
5 2 1 2 0 WW
6 2 1 3 1 WR
7 3 1 1 2 BTR
8 3 1 2 0 RW
9 4 1 1 1 BTR
10 4 2 1 1 BTR
11 5 1 1 1 BTR
12 5 2 1 0 BTW
13 5 2 2 1 WR
14 5 2 3 1 RR
15 6 1 1 0 BTW
16 6 1 2 0 WW
17 6 1 3 0 WW
18 7 1 1 1 BTR
19 8 1 1 0 BTW
20 8 2 1 2 BTR
21 9 1 1 1 BTR
22 9 1 2 2 RR
23 10 1 1 2 BTR
24 10 1 2 1 RW
Any opinions?
Thanks!
In order to solve this, I broke the problem down into two steps: first identify the blank-to-answer rows (the first attempts), then assign the change-of-answer codes to the remaining rows.
# rows that are not the first attempt are assigned the placeholder "NA"
test <- data %>%
  group_by(id, item.id) %>%
  mutate(status = case_when(
    sequence == 1 & score == 0 ~ "BTW",
    sequence == 1 & score > 0 ~ "BTR",
    TRUE ~ "NA"))

answer <- test %>%
  ungroup() %>%
  group_by(id, item.id) %>%
  transmute(sequence, score,
            status = case_when(
              score == 0 & score == lag(score) & status == "NA" ~ "WW",
              score >= 1 & score == lag(score) & status == "NA" ~ "RR",
              score > 0 & score > lag(score) & status == "NA" ~ "WR",
              score < lag(score) & status == "NA" ~ "RW",
              TRUE ~ status))
head(answer, 20)
tail(answer, 4)
The status column matches your sample data for all rows except row 20; please double-check that calculation.
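For what it's worth, the two steps can also be collapsed into a single case_when(); a sketch that follows the stated rules (note it codes a 1-to-2 change as WR, which matches the rules in the question rather than row 22 of the printed desired output):
data %>%
  group_by(id, item.id) %>%
  mutate(status = case_when(
    sequence == 1 & score == 0 ~ "BTW",  # single/first attempt, wrong
    sequence == 1 & score > 0 ~ "BTR",   # single/first attempt, right
    score > lag(score) ~ "WR",           # score increased
    score < lag(score) ~ "RW",           # score decreased
    score == 0 ~ "WW",                   # unchanged and wrong
    TRUE ~ "RR")) %>%                    # unchanged and right
  ungroup()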

Counter max frequency non consecutive numbers

I have some data where one of the variables is a counter with some requirements. What I need to know is how many times that counter reaches 1 for each id; if there are several 1s in a row, they should only be counted once.
For example, say an id has the counter values 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0. I would say that this id has a frequency of 3.
frec_counter counts the number of non-consecutive times a 1 appears. If there are consecutive 1s, only the last one is numbered.
My data:
id <- c(10,10,10,10,10,11,11,11,11,11,11,12,12,12,13, 13, 15, 14)
counter <- c(0,0,1,1,0,1,0,1,0,1,1,1,1,1,0,0,1,1)
DF <- data.frame(id, counter); DF
Id 10 has 0,0,1,1,0: five values but only one non-consecutive run of 1s, so its frec_counter is 0,0,0,1,0.
My desired output:
id <- c(10,10,10,10,10,11,11,11,11,11,11,12,12,12,13, 13, 15, 14)
counter <- c(0,0,1,1,0,1,0,1,0,1,1,1,1,1,0,0,1,1)
frec_counter <- c(0,0,0,1,0,1,0,2,0,0,3,0,0,1,0,0,1,1)
max_counter <- c(1,1,1,1,1,3,3,3,3,3,3,1,1,1,0,0,1,1)
DF <- data.frame(id, counter, frec_counter, max_counter); DF
Here is one approach using tidyverse:
library(tidyverse)
DF %>%
  group_by(id) %>%                                             # group by id
  mutate(one = ifelse(counter == lead(counter), 0, counter),   # if the next value is the same, replace the value with 0
         one = ifelse(is.na(one), counter, one),               # handle the last row in each group, where lead() returns NA
         frec_counter1 = cumsum(one),                          # get the cumulative sum of 1s
         frec_counter1 = ifelse(one == 0, 0, frec_counter1),   # replace the cumsum values with 0 where appropriate
         max_counter1 = max(frec_counter1)) %>%                # get the max frec_counter1 per group
  select(-one)                                                 # remove the dummy variable
#output
id counter frec_counter max_counter frec_counter1 max_counter1
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 10 0 0 1 0 1
2 10 0 0 1 0 1
3 10 1 0 1 0 1
4 10 1 1 1 1 1
5 10 0 0 1 0 1
6 11 1 1 3 1 3
7 11 0 0 3 0 3
8 11 1 2 3 2 3
9 11 0 0 3 0 3
10 11 1 0 3 0 3
11 11 1 3 3 3 3
12 12 1 0 1 0 1
13 12 1 0 1 0 1
14 12 1 1 1 1 1
15 13 0 0 0 0 0
16 13 0 0 0 0 0
17 15 1 1 1 1 1
18 14 1 1 1 1 1
Your data:
id <- c(10,10,10,10,10,11,11,11,11,11,11,12,12,12,13, 13, 15, 14)
counter <- c(0,0,1,1,0,1,0,1,0,1,1,1,1,1,0,0,1,1)
DF <- data.frame(id, counter)
id counter
1 10 0
2 10 0
3 10 1
4 10 1
5 10 0
6 11 1
7 11 0
8 11 1
9 11 0
10 11 1
11 11 1
12 12 1
13 12 1
14 12 1
15 13 0
16 13 0
17 15 1
18 14 1
If all you wanted was the final counts, we could do that in base R:
counts <- with(DF, split(counter, id))                     # split the counter column by id
lengths <- lapply(counts, rle)                             # run-length encode each id's counter
final <- lapply(lengths, function(x) sum(x$values == 1))   # count the runs of 1s per id
$`10`
[1] 1
$`11`
[1] 3
$`12`
[1] 1
$`13`
[1] 0
$`14`
[1] 1
$`15`
[1] 1
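If you also want those per-id totals attached back to the data frame, the named list can be mapped onto the id column; a usage sketch reusing the objects above:
DF$max_counter <- unlist(final)[as.character(DF$id)]  # look up each row's id in the named counts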
But since you specifically want a data frame with the intermediary "flags", the tidyverse set of packages works better:
library(tidyverse)
df.new <- DF %>%
  group_by(id) %>%
  mutate(
    frec_counter = counter == 1 & (is.na(lead(counter)) | lead(counter == 0)),
    frec_counter = as.numeric(frec_counter),
    max_counter = sum(frec_counter)
  )
# A tibble: 18 x 4
# Groups: id [6]
id counter frec_counter max_counter
<dbl> <dbl> <dbl> <dbl>
1 10 0 0 1
2 10 0 0 1
3 10 1 0 1
4 10 1 1 1
5 10 0 0 1
6 11 1 1 3
7 11 0 0 3
8 11 1 1 3
9 11 0 0 3
10 11 1 0 3
11 11 1 1 3
12 12 1 0 1
13 12 1 0 1
14 12 1 1 1
15 13 0 0 0
16 13 0 0 0
17 15 1 1 1
18 14 1 1 1
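For completeness, both desired columns can also be built directly by flagging the last row of each run of 1s; a sketch (not part of the answers above) that reproduces the desired frec_counter and max_counter:
library(dplyr)
DF %>%
  group_by(id) %>%
  mutate(end_of_run = counter == 1 & lead(counter, default = 0) == 0,  # last 1 in each run of 1s
         frec_counter = ifelse(end_of_run, cumsum(end_of_run), 0),     # number those runs 1, 2, 3, ...
         max_counter = sum(end_of_run)) %>%                            # total number of runs of 1s per id
  select(-end_of_run) %>%
  ungroup()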
