Remove rows after observing some specific row values in group id - r

I am trying to filter within each group id and remove the rows that follow the first observation of sex == 2. The data looks like this:
data<- data.frame( id= c(1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3,3 ,3,3,4,4,4), sex=c(1,1,2,2,1,1,1,2,2,2,1,1,2,1,1,2,1,2,2))
data
id sex
1 1
1 1
1 2
1 2
2 1
2 1
2 1
2 2
2 2
2 2
3 1
3 1
3 2
3 1
3 1
3 2
4 1
4 2
4 2
The desired output
id sex
1 1
1 1
1 2
2 1
2 1
2 1
2 2
3 1
3 1
3 2
3 1
3 1
3 2
4 1
4 2
I try to
library(dplyr)
data1 <- data %>% filter(type == 1 ) & silec(2))
But I got an error. Please anyone help?

Data
data<- data.frame( id= c(1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3,3 ,3,3,4,4,4), sex=c(1,1,2,2,1,1,1,2,2,2,1,1,2,1,1,2,1,2,2))
Code
data %>%
#Grouping by id
group_by(id) %>%
#Filter sex = 1 or the first time sex was equal 2
filter( sex == 1 | (cumsum(sex == 2) == 1))
Output
# A tibble: 14 x 2
# Groups: id [4]
id sex
<dbl> <dbl>
1 1 1
2 1 1
3 1 2
4 2 1
5 2 1
6 2 1
7 2 2
8 3 1
9 3 1
10 3 2
11 3 1
12 3 1
13 4 1
14 4 2

You may create a set of consecutive occurring 1's and 2's in each group. From each group select the rows till you encounter the 1st 2 in it.
library(dplyr)
library(data.table)
data %>%
  # rleid() numbers each run of identical sex values; halving and rounding up
  # puts each run together with the run that follows it in one block
  group_by(id, grp = ceiling(rleid(sex) / 2)) %>%
  # Within each block keep the rows up to and including the first 2
  slice(seq_len(match(2, sex))) %>%
  ungroup() %>%
  # Drop the helper grouping column
  select(-grp)
# id sex
# <dbl> <dbl>
# 1 1 1
# 2 1 1
# 3 1 2
# 4 2 1
# 5 2 1
# 6 2 1
# 7 2 2
# 8 3 1
# 9 3 1
#10 3 2
#11 3 1
#12 3 1
#13 3 2
#14 4 1
#15 4 2

Related

create new order for existing column values without reordering rows in dataframe - R

I have some cluster labels resulting from kmeans run on different ids (reprex example below). The problem is that the kmeans cluster codes are not ordered consistently across ids, although all ids have 3 clusters.
# Example data: two ids with four rows each and their kmeans cluster labels
reprex <- data.frame(
  id = rep(1:2, each = 4),
  v1 = rep(seq(1:4), 2),
  cluster = c(2, 2, 1, 3, 3, 1, 2, 2)
)
reprex
id v1 cluster
1 1 1 2
2 1 2 2
3 1 3 1
4 1 4 3
5 2 1 3
6 2 2 1
7 2 3 2
8 2 4 2
what I want is that the variable cluster should always start with 1 within each ID. Note I don't want to reorder that dataframe by cluster, the order needs to remain the same. so the desired result would be:
reprex_desired<- data.frame(id = rep(1:2, each = 4),
v1 = rep(seq(1:4), 2),
cluster = c(2,2,1,3,3,1,2,2),
what_iWant = c(1,1,2,3,1,2,3,3))
reprex_desired
id v1 cluster what_iWant
1 1 1 2 1
2 1 2 2 1
3 1 3 1 2
4 1 4 3 3
5 2 1 3 1
6 2 2 1 2
7 2 3 2 3
8 2 4 2 3
We can use match after grouping by 'id'
library(dplyr)
reprex <- reprex %>%
group_by(id) %>%
mutate(what_IWant = match(cluster, unique(cluster))) %>%
ungroup
-output
reprex
# A tibble: 8 × 4
id v1 cluster what_IWant
<int> <int> <dbl> <int>
1 1 1 2 1
2 1 2 2 1
3 1 3 1 2
4 1 4 3 3
5 2 1 3 1
6 2 2 1 2
7 2 3 2 3
8 2 4 2 3
Here is a version with cumsum combined with lag:
library(dplyr)
df %>%
  group_by(id) %>%
  # Start at 1 within each id and add 1 every time cluster differs from the
  # previous row. The first row is compared with itself via `default`, so it
  # never counts as a change.
  # NOTE: this counts changes, so a label that reappears after a different
  # label gets a new number rather than its earlier one.
  mutate(
    what_i_want = cumsum(cluster != lag(cluster, default = first(cluster))) + 1
  )
id v1 cluster what_i_want
<int> <int> <dbl> <dbl>
1 1 1 2 1
2 1 2 2 1
3 1 3 1 2
4 1 4 3 3
5 2 1 3 1
6 2 2 1 2
7 2 3 2 3
8 2 4 2 3

calculating the mean by another variable's categories in r

I have an example of a dataset like this:
id <- c(1,1,1, 2,2,2, 3,3, 4,4, 5,5,5,5, 6,6,6, 7, 8,8)
item.id <- c(1,1,2, 1,1,1 ,1,1, 1,2, 1,2,2,2, 1,1,1, 1, 1,2)
sequence <- c(1,2,1, 1,2,3, 1,2, 1,1, 1,1,2,3, 1,2,3, 1, 1,1)
score <- c(0,0,0, 0,0,1, 1,0, 1,1, 1,0,1,1, 0,0,0, 1, 0,1)
category <- c(2,2,2, 3,3,3, 1,1, 3,3, 1,1,1,1, 4,4,4, 2, 3,3)
data <- data.frame("id"=id, "item.id"=item.id, "sequence"=sequence, "score"=score, "category"=category)
> data
id item.id sequence score category
1 1 1 1 0 2
2 1 1 2 0 2
3 1 2 1 0 2
4 2 1 1 0 3
5 2 1 2 0 3
6 2 1 3 1 3
7 3 1 1 1 1
8 3 1 2 0 1
9 4 1 1 1 3
10 4 2 1 1 3
11 5 1 1 1 1
12 5 2 1 0 1
13 5 2 2 1 1
14 5 2 3 1 1
15 6 1 1 0 4
16 6 1 2 0 4
17 6 1 3 0 4
18 7 1 1 1 2
19 8 1 1 0 3
20 8 2 1 1 3
id represents persons, item.id is for questions. sequence is for the attempt to change the response, and the score is the score of the item, category is the category each student falls in.
What I want to do is to grab the maximum sequence number for each id per item.id, then calculate the mean score of the maximum sequence value for each category. I was able to complete the first step but could not figure out how to take the cross tab of the mean of the maximum sequence number per category.
library(dplyr)
data %>%
group_by(id,item.id) %>%
summarize(max.seq = max(sequence))
# A tibble: 12 x 3
# Groups: id [?]
id item.id max.seq
<dbl> <dbl> <dbl>
1 1 1 2
2 1 2 1
3 2 1 3
4 3 1 2
5 4 1 1
6 4 2 1
7 5 1 1
8 5 2 3
9 6 1 3
10 7 1 1
11 8 1 1
12 8 2 1
The result of the second step should be:
category 1 2 3 4
mean(max(seq)) 2 1.33 1.4 3
Any suggestions?
Thanks in advance!
You need to get the category value into the summary table. Since the category value is constant for each id, item.id combination using the mean in the summary function is one way.
library(dplyr)
data %>%
group_by(id,item.id) %>%
summarize(cat=mean(category), max.seq = max(sequence)) %>%
group_by(cat) %>% summarize(mean(max.seq))
# A tibble: 4 x 2
cat `mean(max.seq)`
<dbl> <dbl>
1 1 2
2 2 1.33
3 3 1.4
4 4 3
My calculations are slightly different from yours, please double check my method before accepting.

Add a count column and count twice if a certain condition is met

I am wondering if there is a way to make a conditional column-count by a group, adding 1 to a row_number or rowid if a certain value is met (in this case 0). For example:
df<-data.frame(group=c(1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3,3,3,3),
condition=c(1,0,1,1,1,0,0,1,1,0,1,1,0, 1),
want=c(1, 3, 4,5,1,3,5,6,7,2,3,4,6,7))
group condition want
1 1 1 1
2 1 0 3
3 1 1 4
4 1 1 5
5 2 1 1
6 2 0 3
7 2 0 5
8 2 1 6
9 2 1 7
10 3 0 2
11 3 1 3
12 3 1 4
13 3 0 6
14 3 1 7
I think this might involve making a row_number per group and then making a customized row_number but I am open to suggestions. It is kind of a work-around method to "break up" my data when a 0 appears.
Using dplyr, for each group of data (group_by(group)) we can add a column which has a counter from 1 to the length of each group (i.e. n()). By adding a cumulative sum of condition == 0, that counter will jump by one more whenever your desired condition is met.
library(dplyr)
df1 %>%
group_by(group) %>%
mutate(desired = (1:n()) + cumsum(condition == 0))
Output:
#> # A tibble: 14 x 3
#> # Groups: group [3]
#> group condition desired
#> <dbl> <dbl> <int>
#> 1 1 1 1
#> 2 1 0 3
#> 3 1 1 4
#> 4 1 1 5
#> 5 2 1 1
#> 6 2 0 3
#> 7 2 0 5
#> 8 2 1 6
#> 9 2 1 7
#> 10 3 0 2
#> 11 3 1 3
#> 12 3 1 4
#> 13 3 0 6
#> 14 3 1 7
Data:
df1 <- data.frame(group=c(1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3,3,3,3),
condition=c(1,0,1,1,1,0,0,1,1,0,1,1,0, 1))
You can do:
# Per group, build a running total in which a condition of 1 adds 1 and a
# condition of 0 adds 2 (x + (x == 0) * 2), so the counter skips one value at
# every 0. Assumes condition only holds 0/1 values, as in the example data.
transform(df, want = ave(condition, group, FUN = function(x) cumsum(x + (x == 0) * 2 )))
group condition want
1 1 1 1
2 1 0 3
3 1 1 4
4 1 1 5
5 2 1 1
6 2 0 3
7 2 0 5
8 2 1 6
9 2 1 7
10 3 0 2
11 3 1 3
12 3 1 4
13 3 0 6
14 3 1 7

R cummax function with NA

data
data=data.frame("person"=c(1,1,1,1,1,1,1,1,1,2,2,2,2,2,2,2,2,2),
"score"=c(1,2,1,2,3,1,3,NA,4,2,1,NA,2,NA,3,1,2,4),
"want"=c(1,2,1,2,3,3,3,3,4,2,1,1,2,2,3,3,3,4))
attempt
library(dplyr)
data = data %>%
group_by(person) %>%
mutate(wantTEST = ifelse(score >= 3 | (row_number() >= which.max(score == 3)),
cummax(score), score),
wantTEST = replace(wantTEST, duplicated(wantTEST == 4) & wantTEST == 4, NA))
i am basically working to use the cummax function but only under specific circumstances. i want to keep any values (1-2-1-1) except if there is a 3 or 4 (1-2-1-3-2-1-4) should be (1-2-1-3-3-4). if there is NA value i want to carry forward previous value. thank you.
Here's one way with tidyverse. You may want to use fill() after group_by() but that's somewhat unclear.
data %>%
fill(score) %>%
group_by(person) %>%
mutate(
w = ifelse(cummax(score) > 2, cummax(score), score)
) %>%
ungroup()
# A tibble: 18 x 4
person score want w
<dbl> <dbl> <dbl> <dbl>
1 1 1 1 1
2 1 2 2 2
3 1 1 1 1
4 1 2 2 2
5 1 3 3 3
6 1 1 3 3
7 1 3 3 3
8 1 3 3 3
9 1 4 4 4
10 2 2 2 2
11 2 1 1 1
12 2 1 1 1
13 2 2 2 2
14 2 2 2 2
15 2 3 3 3
16 2 1 3 3
17 2 2 3 3
18 2 4 4 4
One way to do this is to first fill NA values and then for each row check if anytime the score of 3 or more is passed in the group. If the score of 3 is reached till that point we take the max score until that point or else return the same score.
library(tidyverse)
data %>%
# Replace NA scores with the previous non-missing score (applied before grouping)
fill(score) %>%
group_by(person) %>%
# For each row position within the person: from the first score equal to 3
# onwards use the maximum score seen so far, otherwise keep the row's own score.
# NOTE(review): which.max() returns 1 when no score equals 3, so such a group
# would get the running maximum from its first row; confirm this is intended.
mutate(want1 = map_dbl(seq_len(n()), ~if(. >= which.max(score == 3))
max(score[seq_len(.)]) else score[.]))
# person score want want1
# <dbl> <dbl> <dbl> <dbl>
# 1 1 1 1 1
# 2 1 2 2 2
# 3 1 1 1 1
# 4 1 2 2 2
# 5 1 3 3 3
# 6 1 1 3 3
# 7 1 3 3 3
# 8 1 3 3 3
# 9 1 4 4 4
#10 2 2 2 2
#11 2 1 1 1
#12 2 1 1 1
#13 2 2 2 2
#14 2 2 2 2
#15 2 3 3 3
#16 2 1 3 3
#17 2 2 3 3
#18 2 4 4 4
Another way is to use accumulate from purrr. I use if_else_ from hablar for type stability:
library(tidyverse)
library(hablar)
data %>%
fill(score) %>%
group_by(person) %>%
mutate(wt = accumulate(score, ~if_else_(.x > 2, max(.x, .y), .y)))

R - Nested list to tibble

I have a nested list like so:
> ex <- list(list(c("This", "is", "an", "example", "."), c("I", "really", "hate", "examples", ".")), list(c("How", "do", "you", "feel", "about", "examples", "?")))
> ex
[[1]]
[[1]][[1]]
[1] "This" "is" "an" "example" "."
[[1]][[2]]
[1] "I" "really" "hate" "examples" "."
[[2]]
[[2]][[1]]
[1] "How" "do" "you" "feel" "about" "examples" "?"
I want to convert it to a tibble like so:
> tibble(d_id = as.integer(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2)),
+ s_id = as.integer(c(1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1)),
+ t_id = as.integer(c(1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6, 7)),
+ token = c("This", "is", "an", "example", ".", "I", "really",
+ "hate", "examples", ".", "How", "do", "you", "feel", "about", "examples", "?"))
# A tibble: 17 x 4
d_id s_id t_id token
<int> <int> <int> <chr>
1 1 1 1 This
2 1 1 2 is
3 1 1 3 an
4 1 1 4 example
5 1 1 5 .
6 1 2 1 I
7 1 2 2 really
8 1 2 3 hate
9 1 2 4 examples
10 1 2 5 .
11 2 1 1 How
12 2 1 2 do
13 2 1 3 you
14 2 1 4 feel
15 2 1 5 about
16 2 1 6 examples
17 2 1 7 ?
What is the most efficient way for me to perform this? Preferably using tidyverse functionality?
Time to get some sequences working, which should be very efficient:
# Top-level (document) index, repeated once per sentence in that document
d_id <- rep(seq_along(ex), lengths(ex))
# Sentence index, restarting at 1 within each document
s_id <- sequence(lengths(ex))
# Number of tokens in each sentence, after flattening the list by one level
t_id <- lengths(unlist(ex, rec=FALSE))
# Expand the per-sentence ids to one row per token; t_id becomes the token
# position within its sentence
data.frame(
d_id = rep(d_id, t_id),
s_id = rep(s_id, t_id),
t_id = sequence(t_id),
token = unlist(ex)
)
# d_id s_id t_id token
#1 1 1 1 This
#2 1 1 2 is
#3 1 1 3 an
#4 1 1 4 example
#5 1 1 5 .
#6 1 2 1 I
#7 1 2 2 really
#8 1 2 3 hate
#9 1 2 4 examples
#10 1 2 5 .
#11 2 1 1 How
#12 2 1 2 do
#13 2 1 3 you
#14 2 1 4 feel
#15 2 1 5 about
#16 2 1 6 examples
#17 2 1 7 ?
This will run in about 2 seconds for a 500K sample of your ex list. I suspect that will be hard to beat in terms of efficiency.
We can do
ex %>%
set_names(seq_along(ex)) %>%
map( ~ set_names(.x, seq_along(.x)) %>%
stack) %>%
bind_rows(.id = 'd_id') %>%
group_by(d_id, s_id = ind) %>%
mutate(t_id = row_number()) %>%
select(d_id, s_id, t_id, token = values)
# A tibble: 17 x 4
# Groups: d_id, s_id [3]
# d_id s_id t_id token
# <chr> <chr> <int> <chr>
# 1 1 1 1 This
# 2 1 1 2 is
# 3 1 1 3 an
# 4 1 1 4 example
# 5 1 1 5 .
# 6 1 2 1 I
# 7 1 2 2 really
# 8 1 2 3 hate
# 9 1 2 4 examples
#10 1 2 5 .
#11 2 1 1 How
#12 2 1 2 do
#13 2 1 3 you
#14 2 1 4 feel
#15 2 1 5 about
#16 2 1 6 examples
#17 2 1 7 ?
You can use melt from the reshape2 package:
library(data.table)
setDT(melt(ex))[, .(d_id = L1, s_id = L2, t_id = rowid(L1, L2), token = value)]
d_id s_id t_id token
1: 1 1 1 This
2: 1 1 2 is
3: 1 1 3 an
4: 1 1 4 example
5: 1 1 5 .
6: 1 2 1 I
7: 1 2 2 really
8: 1 2 3 hate
9: 1 2 4 examples
10: 1 2 5 .
11: 2 1 1 How
12: 2 1 2 do
13: 2 1 3 you
14: 2 1 4 feel
15: 2 1 5 about
16: 2 1 6 examples
17: 2 1 7 ?
I'm showing it here with data.table, since I know how to do the column selection and renaming in one step from there (though it should be no trouble with dplyr instead). The melt.list function is coming from reshape2.
Another tidyverse solution:
library(tidyverse)
ex %>%
modify_depth(-1,~tibble(token=.x) %>% rowid_to_column("t_id")) %>%
map(~map_dfr(.x,identity,.id = "s_id")) %>%
map_dfr(identity,.id = "d_id")
# # A tibble: 17 x 4
# d_id s_id t_id token
# <chr> <chr> <int> <chr>
# 1 1 1 1 This
# 2 1 1 2 is
# 3 1 1 3 an
# 4 1 1 4 example
# 5 1 1 5 .
# 6 1 2 1 I
# 7 1 2 2 really
# 8 1 2 3 hate
# 9 1 2 4 examples
# 10 1 2 5 .
# 11 2 1 1 How
# 12 2 1 2 do
# 13 2 1 3 you
# 14 2 1 4 feel
# 15 2 1 5 about
# 16 2 1 6 examples
# 17 2 1 7 ?

Resources