Identify whenever values repeat in R

I have a dataframe like this.
data <- data.frame(Condition = c(1,1,2,3,1,1,2,2,2,3,1,1,2,3,3))
I want to populate a new variable, Sequence, which identifies whenever Condition starts again from 1.
So the new dataframe would look like this:
data <- data.frame(Condition = c(1,1,2,3,1,1,2,2,2,3,1,1,2,3,3),
                   Sequence = c(1,1,1,1,2,2,2,2,2,2,3,3,3,3,3))
Thanks in advance for the help!

base R
data$Sequence2 <- cumsum(c(TRUE, data$Condition[-1] == 1 & data$Condition[-nrow(data)] != 1))
data
# Condition Sequence Sequence2
# 1 1 1 1
# 2 1 1 1
# 3 2 1 1
# 4 3 1 1
# 5 1 2 2
# 6 1 2 2
# 7 2 2 2
# 8 2 2 2
# 9 2 2 2
# 10 3 2 2
# 11 1 3 3
# 12 1 3 3
# 13 2 3 3
# 14 3 3 3
# 15 3 3 3
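To see why this works, here is a sketch of the intermediate flag vector (the helper name starts is only for illustration): a row starts a new sequence when it is a 1 that is not preceded by a 1, and the first row always starts one.
starts <- c(TRUE, data$Condition[-1] == 1 & data$Condition[-nrow(data)] != 1)
starts
# TRUE at rows 1, 5 and 11, FALSE elsewhere
cumsum(starts)
#  [1] 1 1 1 1 2 2 2 2 2 2 3 3 3 3 3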
dplyr
library(dplyr)
data %>%
  mutate(
    Sequence2 = cumsum(Condition == 1 & lag(Condition != 1, default = TRUE))
  )
# Condition Sequence Sequence2
# 1 1 1 1
# 2 1 1 1
# 3 2 1 1
# 4 3 1 1
# 5 1 2 2
# 6 1 2 2
# 7 2 2 2
# 8 2 2 2
# 9 2 2 2
# 10 3 2 2
# 11 1 3 3
# 12 1 3 3
# 13 2 3 3
# 14 3 3 3
# 15 3 3 3
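The lag() comparison encodes the same rule as the base R version: lag(Condition != 1, default = TRUE) is TRUE wherever the previous row is not a 1 (and on the first row, via the default), so combining it with Condition == 1 flags the start of each sequence. A quick check, assuming dplyr is loaded:
with(data, Condition == 1 & lag(Condition != 1, default = TRUE))
# TRUE at rows 1, 5 and 11, FALSE elsewhere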

This took a while. Finally I found this solution:
library(dplyr)
data %>%
  group_by(Sequence = cumsum(
    ifelse(Condition == 1, lead(Condition) + 1, Condition)
      - Condition == 1)
  )
Condition Sequence
<dbl> <int>
1 1 1
2 1 1
3 2 1
4 3 1
5 1 2
6 1 2
7 2 2
8 2 2
9 2 2
10 3 2
11 1 3
12 1 3
13 2 3
14 3 3
15 3 3
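A note on why this works (my reading, not the original author's): since - binds tighter than ==, the grouping expression is parsed as (ifelse(...) - Condition) == 1. On rows where Condition is 1 this reduces to lead(Condition) == 1, so the flag is TRUE on the first of each pair of consecutive 1s, which happens to mark every sequence start in this data:
library(dplyr)
with(data, (ifelse(Condition == 1, lead(Condition) + 1, Condition) - Condition) == 1)
# TRUE at rows 1, 5 and 11, FALSE elsewhere; its cumsum() gives the Sequence column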

Related

Creating an indexed column in R, grouped by user_id, that does not increase when NA

I want to create a column (in R) that indexes the presence of a number in another column, grouped by a user_id column. When the other column is NA, the new column should not increase.
The example should bring clarity.
I have this df:
data <- data.frame(user_id = c(1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3),
                   one = c(1, NA, 3, 2, NA, 0, NA, 4, 3, 4, NA))
user_id tobeindexed
1 1 1
2 1 NA
3 1 3
4 2 2
5 2 NA
6 2 0
7 2 NA
8 3 4
9 3 3
10 3 4
11 3 NA
I want to make a new column looking like "desired" in the following df:
> cbind(data,data.frame(desired = c(1,1,2,1,1,2,2,1,2,3,3)))
user_id tobeindexed desired
1 1 1 1
2 1 NA 1
3 1 3 2
4 2 2 1
5 2 NA 1
6 2 0 2
7 2 NA 2
8 3 4 1
9 3 3 2
10 3 4 3
11 3 NA 3
How can I solve this?
Using cumsum and group_by gets me close, but the count does not start over from 1 when the user_id changes...
> data %>% group_by(user_id) %>% mutate(desired = cumsum(!is.na(tobeindexed)))
user_id tobeindexed desired
<dbl> <dbl> <int>
1 1 1 1
2 1 NA 1
3 1 3 2
4 2 2 3
5 2 NA 3
6 2 0 4
7 2 NA 4
8 3 4 5
9 3 3 6
10 3 4 7
11 3 NA 7
Given the sample data you provided (with the one column), the grouped cumsum approach works unchanged; the code is shown below for demonstration.
base R
data$out <- ave(data$one, data$user_id, FUN = function(z) cumsum(!is.na(z)))
data
# user_id one out
# 1 1 1 1
# 2 1 NA 1
# 3 1 3 2
# 4 2 2 1
# 5 2 NA 1
# 6 2 0 2
# 7 2 NA 2
# 8 3 4 1
# 9 3 3 2
# 10 3 4 3
# 11 3 NA 3
dplyr
library(dplyr)
data %>%
  group_by(user_id) %>%
  mutate(out = cumsum(!is.na(one))) %>%
  ungroup()
# # A tibble: 11 × 3
# user_id one out
# <dbl> <dbl> <int>
# 1 1 1 1
# 2 1 NA 1
# 3 1 3 2
# 4 2 2 1
# 5 2 NA 1
# 6 2 0 2
# 7 2 NA 2
# 8 3 4 1
# 9 3 3 2
# 10 3 4 3
# 11 3 NA 3
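For completeness, the same grouped cumulative count can be written with data.table (a sketch, assuming the data frame from the question, whose second column is named one):
library(data.table)
setDT(data)[, out := cumsum(!is.na(one)), by = user_id]
data
# out is 1 1 2 1 1 2 2 1 2 3 3, matching the desired column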

Adding sequence of numbers to the data

Hi, I have a data frame like this:
df <- data.frame(x = rep(rep(seq(0, 3), each = 2), 2), gr = gl(2, 8))
x gr
1 0 1
2 0 1
3 1 1
4 1 1
5 2 1
6 2 1
7 3 1
8 3 1
9 0 2
10 0 2
11 1 2
12 1 2
13 2 2
14 2 2
15 3 2
16 3 2
I want to add a new column with a numbering sequence that starts again whenever the x value == 0.
I tried
library(dplyr)
df %>%
  group_by(gr) %>%
  mutate(numbering = seq(2, 8, 2))
Error in mutate_impl(.data, dots) :
Column `numbering` must be length 8 (the group size) or one, not 4
?
Just as a side note: mutate(numbering = rep(seq(2, 8, 2), each = 2)) would work for this minimal example, but for the general case it is better to look at where the x value changes from 0!
The expected output:
x gr numbering
1 0 1 2
2 0 1 2
3 1 1 4
4 1 1 4
5 2 1 6
6 2 1 6
7 3 1 8
8 3 1 8
9 0 2 2
10 0 2 2
11 1 2 4
12 1 2 4
13 2 2 6
14 2 2 6
15 3 2 8
16 3 2 8
Do you mean something like this?
library(tidyverse);
df %>%
  group_by(gr) %>%
  mutate(numbering = cumsum(c(1, diff(x) != 0)))
## A tibble: 16 x 3
## Groups: gr [2]
# x gr numbering
# <int> <fct> <dbl>
# 1 0 1 1.
# 2 0 1 1.
# 3 1 1 2.
# 4 1 1 2.
# 5 2 1 3.
# 6 2 1 3.
# 7 3 1 4.
# 8 3 1 4.
# 9 0 2 1.
#10 0 2 1.
#11 1 2 2.
#12 1 2 2.
#13 2 2 3.
#14 2 2 3.
#15 3 2 4.
#16 3 2 4.
Or if you must have a numbering sequence 2,4,6,... instead of 1,2,3,... you can do
df %>%
  group_by(gr) %>%
  mutate(numbering = 2 * cumsum(c(1, diff(x) != 0)));
## A tibble: 16 x 3
## Groups: gr [2]
# x gr numbering
# <int> <fct> <dbl>
# 1 0 1 2.
# 2 0 1 2.
# 3 1 1 4.
# 4 1 1 4.
# 5 2 1 6.
# 6 2 1 6.
# 7 3 1 8.
# 8 3 1 8.
# 9 0 2 2.
#10 0 2 2.
#11 1 2 4.
#12 1 2 4.
#13 2 2 6.
#14 2 2 6.
#15 3 2 8.
#16 3 2 8.
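The key idea in both variants is that diff(x) != 0 flags the rows where x changes from the previous row; prepending a 1 starts the counter, and cumsum() turns the flags into a run index. A minimal sketch on the x values of one group (gr == 1), with x1 used only for illustration:
x1 <- c(0, 0, 1, 1, 2, 2, 3, 3)
c(1, diff(x1) != 0)
# [1] 1 0 1 0 1 0 1 0
cumsum(c(1, diff(x1) != 0))
# [1] 1 1 2 2 3 3 4 4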
Here is an option using match to get the index of each distinct x value and then using it to pick the corresponding value from the seq sequence:
df %>%
  group_by(gr) %>%
  mutate(numbering = seq(2, length.out = n()/2, by = 2)[match(x, unique(x))])
# A tibble: 16 x 3
# Groups: gr [2]
# x gr numbering
# <int> <fct> <dbl>
# 1 0 1 2
# 2 0 1 2
# 3 1 1 4
# 4 1 1 4
# 5 2 1 6
# 6 2 1 6
# 7 3 1 8
# 8 3 1 8
# 9 0 2 2
#10 0 2 2
#11 1 2 4
#12 1 2 4
#13 2 2 6
#14 2 2 6
#15 3 2 8
#16 3 2 8
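Here match(x, unique(x)) maps each x value to the position of its first occurrence within the group, and that index is used to pick the matching element of the 2, 4, 6, ... sequence. A small sketch outside the pipeline, again on the x values of gr == 1 (x1 is only for illustration):
x1 <- c(0, 0, 1, 1, 2, 2, 3, 3)
match(x1, unique(x1))
# [1] 1 1 2 2 3 3 4 4
seq(2, length.out = length(x1)/2, by = 2)[match(x1, unique(x1))]
# [1] 2 2 4 4 6 6 8 8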

Delete rows with value if not only value in group

Somewhat new to R, and I find myself needing to delete rows based on multiple criteria. The data frame has 3 columns, and I need to delete rows where bid = 99 whenever the group defined by rid and qid also contains values less than 99. In other words, at the rid and qid level the desired output keeps the bid values that are less than 99, and keeps bid = 99 only when it is the sole value in the group.
rid qid bid
1 1 5
1 1 6
1 1 99
1 2 6
2 1 7
2 1 99
2 2 2
2 2 3
3 1 7
3 1 8
3 2 1
3 2 99
4 1 2
4 1 6
4 2 1
4 2 2
4 2 99
5 1 99
5 2 99
The expected output...
rid qid bid
1 1 5
1 1 6
1 2 6
2 1 7
2 2 2
2 2 3
3 1 7
3 1 8
3 2 1
4 1 2
4 1 6
4 2 1
4 2 2
5 1 99
5 2 99
Any assistance would be appreciated.
You can use the base R function ave to generate a dropping variable like this:
df$dropper <- with(df, ave(bid, rid, qid, FUN= function(i) i == 99 & length(i) > 1))
ave calculates a function on bid, grouping by rid and qid. The function tests if each element of the grouped bid values i is 99 and if i has a length greater than 1. Also, with is used to reduce typing.
which returns
df
rid qid bid dropper
1 1 1 5 0
2 1 1 6 0
3 1 1 99 1
4 1 2 6 0
5 2 1 7 0
6 2 1 99 1
7 2 2 2 0
8 2 2 3 0
9 3 1 7 0
10 3 1 8 0
11 3 2 1 0
12 3 2 99 1
13 4 1 2 0
14 4 1 6 0
15 4 2 1 0
16 4 2 2 0
17 4 2 99 1
18 5 1 99 0
19 5 2 99 0
Then drop the undesired observations with df[df$dropper == 0, 1:3], which will simultaneously drop the new variable.
If you want to just delete rows where bid = 99 then use dplyr.
library(dplyr)
df <- df %>%
  filter(bid != 99)
Here df is your data frame and != means "not equal to".
Updated solution using dplyr
df %>%
  group_by(rid, qid) %>%
  mutate(tempcount = n()) %>%
  ungroup() %>%
  mutate(DropValue = ifelse(bid == 99 & tempcount > 1, 1, 0)) %>%
  filter(DropValue == 0) %>%
  select(rid, qid, bid)
Here is another option, using all with an if condition in data.table, to subset the rows after grouping by 'rid' and 'qid':
library(data.table)
setDT(df1)[, if(all(bid==99)) .SD else .SD[bid!= 99], .(rid, qid)]
# rid qid bid
# 1: 1 1 5
# 2: 1 1 6
# 3: 1 2 6
# 4: 2 1 7
# 5: 2 2 2
# 6: 2 2 3
# 7: 3 1 7
# 8: 3 1 8
# 9: 3 2 1
#10: 4 1 2
#11: 4 1 6
#12: 4 2 1
#13: 4 2 2
#14: 5 1 99
#15: 5 2 99
Or, without using the if, collect the row numbers (.I) that satisfy the condition within each group and subset with them:
setDT(df1)[df1[, .I[all(bid==99) | bid != 99], .(rid, qid)]$V1]
Here is a solution using dplyr, which is a very expressive framework for this kind of problem.
df <- read.table(text =
" rid qid bid
1 1 5
1 1 6
1 1 99
1 2 6
2 1 7
2 1 99
2 2 2
2 2 3
3 1 7
3 1 8
3 2 1
3 2 99
4 1 2
4 1 6
4 2 1
4 2 2
4 2 99
5 1 99
5 2 99",
header = TRUE, stringsAsFactors = FALSE)
dplyr verbs let you express the program in terms that stay close to the wording of your question: within each (rid, qid) group, drop the bid values equal to 99 whenever the group also contains values below 99.
library(dplyr)
res <-
  df %>%
  group_by(rid, qid) %>%
  filter(!(any(bid < 99) & bid == 99)) %>%
  ungroup()
res
# # A tibble: 15 × 3
# rid qid bid
# <int> <int> <int>
# 1 1 1 5
# 2 1 1 6
# 3 1 2 6
# 4 2 1 7
# 5 2 2 2
# 6 2 2 3
# 7 3 1 7
# 8 3 1 8
# 9 3 2 1
# 10 4 1 2
# 11 4 1 6
# 12 4 2 1
# 13 4 2 2
# 14 5 1 99
# 15 5 2 99
Let's check that we get the desired output:
desired_output <- read.table(text =
" rid qid bid
1 1 5
1 1 6
1 2 6
2 1 7
2 2 2
2 2 3
3 1 7
3 1 8
3 2 1
4 1 2
4 1 6
4 2 1
4 2 2
5 1 99
5 2 99",
header = TRUE, stringsAsFactors = FALSE)
identical(as.data.frame(res), desired_output)
# [1] TRUE

How do I add a vector where I collapse scores from individuals within pairs?

I have done an experiment in which participants solved a task in pairs, with another participant. Each participant then received a score for how well they did the task. Pairs have gone through different numbers of trials.
I have a data frame similar to the one below:
participant <- c(1,1,2,2,3,3,3,4,4,4,5,6)
pair <- c(1,1,1,1,2,2,2,2,2,2,3,3)
trial <- c(1,2,1,2,1,2,3,1,2,3,1,1)
score <- c(2,3,6,3,4,7,3,1,8,5,4,3)
data <- data.frame(participant, pair, trial, score)
participant pair trial score
1 1 1 2
1 1 2 3
2 1 1 6
2 1 2 3
3 2 1 4
3 2 2 7
3 2 3 3
4 2 1 1
4 2 2 8
4 2 3 5
5 3 1 4
6 3 1 3
I would like to add a new vector to the data frame, where each participant gets the numeric difference between their own score and the other participant's score within each trial.
Does someone have an idea about how one might do that?
It should end up looking something like this:
participant pair trial score difference
1 1 1 2 4
1 1 2 3 0
2 1 1 6 4
2 1 2 3 0
3 2 1 4 3
3 2 2 7 1
3 2 3 3 2
4 2 1 1 3
4 2 2 8 1
4 2 3 5 2
5 3 1 4 1
6 3 1 3 1
Here's a solution that involves first reordering data such that each sequential pair of rows corresponds to a single pair within a single trial. This allows us to make a single call to diff() to extract the differences:
data <- data[order(data$trial,data$pair,data$participant),];
data$diff <- rep(diff(data$score)[c(T,F)],each=2L)*c(-1L,1L);
data;
## participant pair trial score diff
## 1 1 1 1 2 -4
## 3 2 1 1 6 4
## 5 3 2 1 4 3
## 8 4 2 1 1 -3
## 11 5 3 1 4 1
## 12 6 3 1 3 -1
## 2 1 1 2 3 0
## 4 2 1 2 3 0
## 6 3 2 2 7 -1
## 9 4 2 2 8 1
## 7 3 2 3 3 -2
## 10 4 2 3 5 2
I assumed you wanted the sign to capture the direction of the difference. So, for instance, if a participant has a score 4 points below the other participant in the same trial-pair, then I assumed you would want -4. If you want all-positive values, you can remove the multiplication by c(-1L,1L) and add a call to abs():
data$diff <- rep(abs(diff(data$score)[c(T,F)]),each=2L);
data;
## participant pair trial score diff
## 1 1 1 1 2 4
## 3 2 1 1 6 4
## 5 3 2 1 4 3
## 8 4 2 1 1 3
## 11 5 3 1 4 1
## 12 6 3 1 3 1
## 2 1 1 2 3 0
## 4 2 1 2 3 0
## 6 3 2 2 7 1
## 9 4 2 2 8 1
## 7 3 2 3 3 2
## 10 4 2 3 5 2
Here's a solution built around ave() that doesn't require reordering the whole data.frame first:
data$diff <- ave(data$score,data$trial,data$pair,FUN=function(x) abs(diff(x)));
data;
## participant pair trial score diff
## 1 1 1 1 2 4
## 2 1 1 2 3 0
## 3 2 1 1 6 4
## 4 2 1 2 3 0
## 5 3 2 1 4 3
## 6 3 2 2 7 1
## 7 3 2 3 3 2
## 8 4 2 1 1 3
## 9 4 2 2 8 1
## 10 4 2 3 5 2
## 11 5 3 1 4 1
## 12 6 3 1 3 1
Here's how you can get the score of the other participant in the same trial-pair:
data$other <- ave(data$score,data$trial,data$pair,FUN=rev);
data;
## participant pair trial score other
## 1 1 1 1 2 6
## 2 1 1 2 3 3
## 3 2 1 1 6 2
## 4 2 1 2 3 3
## 5 3 2 1 4 1
## 6 3 2 2 7 8
## 7 3 2 3 3 5
## 8 4 2 1 1 4
## 9 4 2 2 8 7
## 10 4 2 3 5 3
## 11 5 3 1 4 3
## 12 6 3 1 3 4
Or, assuming the data.frame has been reordered as per the initial solution:
data$other <- c(rbind(data$score[c(F,T)],data$score[c(T,F)]));
data;
## participant pair trial score other
## 1 1 1 1 2 6
## 3 2 1 1 6 2
## 5 3 2 1 4 1
## 8 4 2 1 1 4
## 11 5 3 1 4 3
## 12 6 3 1 3 4
## 2 1 1 2 3 3
## 4 2 1 2 3 3
## 6 3 2 2 7 8
## 9 4 2 2 8 7
## 7 3 2 3 3 5
## 10 4 2 3 5 3
Alternative, using matrix() instead of rbind(): each column of the 2-row matrix holds one trial-pair, so reversing the rows swaps the scores within each pair:
data$other <- c(matrix(data$score,2L)[2:1,]);
data;
## participant pair trial score other
## 1 1 1 1 2 6
## 3 2 1 1 6 2
## 5 3 2 1 4 1
## 8 4 2 1 1 4
## 11 5 3 1 4 3
## 12 6 3 1 3 4
## 2 1 1 2 3 3
## 4 2 1 2 3 3
## 6 3 2 2 7 8
## 9 4 2 2 8 7
## 7 3 2 3 3 5
## 10 4 2 3 5 3
Here is an option using data.table:
library(data.table)
setDT(data)[,difference := abs(diff(score)), by = .(pair, trial)]
data
# participant pair trial score difference
# 1: 1 1 1 2 4
# 2: 1 1 2 3 0
# 3: 2 1 1 6 4
# 4: 2 1 2 3 0
# 5: 3 2 1 4 3
# 6: 3 2 2 7 1
# 7: 3 2 3 3 2
# 8: 4 2 1 1 3
# 9: 4 2 2 8 1
#10: 4 2 3 5 2
#11: 5 3 1 4 1
#12: 6 3 1 3 1
A slightly faster option would be:
setDT(data)[, difference := abs((score - shift(score))[2]) , by = .(pair, trial)]
If we need the other participant's score within the pair:
data[, other:= rev(score) , by = .(pair, trial)]
data
# participant pair trial score difference other
# 1: 1 1 1 2 4 6
# 2: 1 1 2 3 0 3
# 3: 2 1 1 6 4 2
# 4: 2 1 2 3 0 3
# 5: 3 2 1 4 3 1
# 6: 3 2 2 7 1 8
# 7: 3 2 3 3 2 5
# 8: 4 2 1 1 3 4
# 9: 4 2 2 8 1 7
#10: 4 2 3 5 2 3
#11: 5 3 1 4 1 3
#12: 6 3 1 3 1 4
Or using dplyr:
library(dplyr)
data %>%
  group_by(pair, trial) %>%
  mutate(difference = abs(diff(score)))
# participant pair trial score difference
# <dbl> <dbl> <dbl> <dbl> <dbl>
#1 1 1 1 2 4
#2 1 1 2 3 0
#3 2 1 1 6 4
#4 2 1 2 3 0
#5 3 2 1 4 3
#6 3 2 2 7 1
#7 3 2 3 3 2
#8 4 2 1 1 3
#9 4 2 2 8 1
#10 4 2 3 5 2
#11 5 3 1 4 1
#12 6 3 1 3 1
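All of the approaches above rely on each (pair, trial) group containing exactly two rows, as in the sample data. Under that same assumption, here is a dplyr sketch that does not depend on the row order within the group: recover the partner's score as the group sum minus one's own score.
library(dplyr)
data %>%
  group_by(pair, trial) %>%
  mutate(other = sum(score) - score,        # the partner's score
         difference = abs(score - other)) %>%
  ungroup()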

Count with table() and exclude 0's

I am trying to count triplets; for this I use three vectors that are packed into a data frame:
X=c(4,4,4,4,4,4,4,4,1,1,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,3,3)
Y=c(1,1,1,1,1,1,1,1,1,1,1,1,2,2,3,4,2,2,2,2,3,4,1,1,2,2,3,3,4,4)
Z=c(4,4,5,4,4,4,4,4,6,1,1,1,1,1,1,1,2,2,2,2,7,2,3,3,3,3,3,3,3,3)
Count_Frame=data.frame(matrix(NA, nrow=(length(X)), ncol=3))
Count_Frame[1]=X
Count_Frame[2]=Y
Count_Frame[3]=Z
Counts=data.frame(table(Count_Frame))
There is the following problem: if I increase the value range in the vectors, or use even more vectors, the Counts data frame quickly approaches its size limit because of the many 0-counts. Is there a way to exclude the 0-counts while generating Counts?
We can use data.table. Convert the data.frame to a data.table (setDT(Count_Frame)) and, grouping by all the columns (.(X1, X2, X3)), count the number of rows (.N).
library(data.table)
setDT(Count_Frame)[, .N, .(X1, X2, X3)]
#    X1 X2 X3 N
# 1: 4 1 4 7
# 2: 4 1 5 1
# 3: 1 1 6 1
# 4: 1 1 1 3
# 5: 1 2 1 2
# 6: 1 3 1 1
# 7: 1 4 1 1
# 8: 2 2 2 4
# 9: 2 3 7 1
#10: 2 4 2 1
#11: 3 1 3 2
#12: 3 2 3 2
#13: 3 3 3 2
#14: 3 4 3 2
Instead of naming all the columns, we can use names(Count_Frame) as well (if there are many columns)
setDT(Count_Frame)[,.N , names(Count_Frame)]
You can accomplish this with aggregate:
Count_Frame$one <- 1
aggregate(one ~ X1 + X2 + X3, data=Count_Frame, FUN=sum)
This will calculate the positive instances of table, but will not list the zero counts.
One solution is to create a combination of the column values and count those instead:
library(tidyr)
as.data.frame(table(unite(Count_Frame, tmp, X1, X2, X3))) %>%
  separate(Var1, c('X1', 'X2', 'X3'))
Resulting output is:
X1 X2 X3 Freq
1 1 1 1 3
2 1 1 6 1
3 1 2 1 2
4 1 3 1 1
5 1 4 1 1
6 2 2 2 4
7 2 3 7 1
8 2 4 2 1
9 3 1 3 2
10 3 2 3 2
11 3 3 3 2
12 3 4 3 2
13 4 1 4 7
14 4 1 5 1
Or using plyr:
library(plyr)
count(Count_Frame, colnames(Count_Frame))
output
# > count(Count_Frame, colnames(Count_Frame))
# X1 X2 X3 freq
# 1 1 1 1 3
# 2 1 1 6 1
# 3 1 2 1 2
# 4 1 3 1 1
# 5 1 4 1 1
# 6 2 2 2 4
# 7 2 3 7 1
# 8 2 4 2 1
# 9 3 1 3 2
# 10 3 2 3 2
# 11 3 3 3 2
# 12 3 4 3 2
# 13 4 1 4 7
# 14 4 1 5 1
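For completeness, dplyr's count() gives the same zero-free result directly, since it only lists combinations that actually occur (a sketch, assuming the Count_Frame built above with columns X1, X2, X3):
library(dplyr)
Count_Frame %>%
  count(X1, X2, X3)
# one row per observed (X1, X2, X3) triplet, with the frequency in column n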
