Replace duplicated values with 0 within each ID in R

I have a dataframe like this one:
df
ID job_code
1 8
1 8
1 8
2 7
2 7
2 4
3 1
3 2
If an individual has the same job code several times, I would like to keep only the first occurrence and replace the others with 0, to obtain a dataframe like this one:
df
ID job_code job_code_2
1 8 8
1 8 0
1 8 0
2 7 7
2 7 0
2 4 4
3 1 1
3 2 2
I thought of using something like:
dataframe %>%
  group_by(ID) %>%
  ...
together with replace, but I am not sure how.
Thank you in advance for your help.

library(tidyverse)
df <- data.frame(
  ID = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L),
  job_code = c(8L, 8L, 8L, 7L, 7L, 4L, 1L, 2L)
)
df %>%
  group_by(ID, job_code) %>%
  mutate(job_code2 = job_code * +(row_number() == 1)) %>%
  ungroup()
#> # A tibble: 8 x 3
#> ID job_code job_code2
#> <int> <int> <int>
#> 1 1 8 8
#> 2 1 8 0
#> 3 1 8 0
#> 4 2 7 7
#> 5 2 7 0
#> 6 2 4 4
#> 7 3 1 1
#> 8 3 2 2
Created on 2022-03-23 by the reprex package (v2.0.1)
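The +(row_number() == 1) part is just a compact logical-to-integer coercion: the unary + turns TRUE/FALSE into 1/0, so multiplying job_code by it zeroes out every row after the first within each (ID, job_code) group. A quick illustration:
x <- c(1, 2, 3)
+(x == 1)
#> [1] 1 0 0
8 * +(x == 1)
#> [1] 8 0 0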

Use duplicated:
df %>%
  group_by(ID) %>%
  mutate(job_code2 = ifelse(duplicated(job_code), 0, job_code)) %>%
  ungroup()
In base R you can use tapply + duplicated:
df$job_code2 <- unlist(tapply(df$job_code, df$ID, function(x) ifelse(duplicated(x), 0, x)))
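One caution here (my note, not part of the original answer): unlist(tapply(...)) returns the values regrouped in sorted-ID order, so the assignment only lines up because the rows are already sorted by ID. ave puts results back in the original row positions, so it avoids that assumption:
df$job_code2 <- ave(df$job_code, df$ID, FUN = function(x) ifelse(duplicated(x), 0, x))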

Another possible solution:
library(tidyverse)
df <- read_table("ID job_code
1 8
1 8
1 8
2 7
2 7
2 4
3 1
3 2")
df %>%
  group_by(ID, job_code) %>%
  mutate(job_code = if_else(row_number() > 1, 0, job_code)) %>%
  ungroup()
#> # A tibble: 8 x 2
#> ID job_code
#> <dbl> <dbl>
#> 1 1 8
#> 2 1 0
#> 3 1 0
#> 4 2 7
#> 5 2 0
#> 6 2 4
#> 7 3 1
#> 8 3 2

The first solution is good, but for some subjects it doesn't work, and I don't know why. It fails for subjects whose code has already appeared for a previous subject: for example, for subject 4 I get a 0 where I should get an 8.
I have this:
ID job_code job_code_2
1 8 8
1 8 0
1 8 0
2 7 7
2 7 0
2 4 4
3 1 1
3 2 2
4 8 0
Instead of this:
ID job_code job_code_2
1 8 8
1 8 0
1 8 0
2 7 7
2 7 0
2 4 4
3 1 1
3 2 2
4 8 8
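That symptom is consistent with duplicated() having been applied to the whole column instead of within each ID (an assumption, since the failing code isn't shown): without group_by(ID), subject 4's code 8 is treated as a duplicate of subject 1's. Scoping the check to each subject fixes it; a sketch with the extended data:
library(dplyr)
df <- data.frame(
  ID = c(1L, 1L, 1L, 2L, 2L, 2L, 3L, 3L, 4L),
  job_code = c(8L, 8L, 8L, 7L, 7L, 4L, 1L, 2L, 8L)
)
df %>%
  group_by(ID) %>% # duplicated() restarts for every subject
  mutate(job_code_2 = ifelse(duplicated(job_code), 0, job_code)) %>%
  ungroup()
# subject 4 now keeps its 8, because no earlier row of ID 4 has code 8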


How to create a column that sums up the rows by group

the data example is like this below:
A B C
0 1 2
0 1 2
1 10 15
1 5 15
0 2 5
0 3 5
1 20 50
1 30 50
Above, A and B are the original data, and C is the column I want to create: the group sum of B over runs of identical, adjacent A values. Even though several rows have A = 0, if they are not adjacent they should not be summed together.
tidyverse
library(tidyverse)
df <- data.frame(
  A = c(0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L),
  B = c(1L, 1L, 10L, 5L, 2L, 3L, 20L, 30L)
)
df %>%
  group_by(grp = data.table::rleid(A)) %>%
  mutate(res = sum(B)) %>%
  ungroup() %>%
  select(-grp)
#> # A tibble: 8 × 3
#> A B res
#> <int> <int> <int>
#> 1 0 1 2
#> 2 0 1 2
#> 3 1 10 15
#> 4 1 5 15
#> 5 0 2 5
#> 6 0 3 5
#> 7 1 20 50
#> 8 1 30 50
Created on 2022-04-27 by the reprex package (v2.0.1)
data.table
library(data.table)
setDT(df)[, res := sum(B), by = rleid(A)][]
#> A B res
#> 1: 0 1 2
#> 2: 0 1 2
#> 3: 1 10 15
#> 4: 1 5 15
#> 5: 0 2 5
#> 6: 0 3 5
#> 7: 1 20 50
#> 8: 1 30 50
Created on 2022-04-27 by the reprex package (v2.0.1)
base
df$res <- with(df, ave(B, data.table::rleid(A), FUN = sum))
df
#> A B res
#> 1 0 1 2
#> 2 0 1 2
#> 3 1 10 15
#> 4 1 5 15
#> 5 0 2 5
#> 6 0 3 5
#> 7 1 20 50
#> 8 1 30 50
Created on 2022-04-27 by the reprex package (v2.0.1)
library(tidyverse)
data <- tribble(
  ~A, ~B, ~C,
  0, 1, 2,
  0, 1, 2,
  1, 10, 15,
  1, 5, 15,
  0, 2, 5,
  0, 3, 5,
  1, 20, 50,
  1, 30, 50
)
data <- data %>% select(-C)
rle_group <- function(x) {
  # label each run of identical values with a sequential id
  runs <- rle(x)
  runs$values <- seq_along(runs$values)
  inverse.rle(runs)
}
data %>%
  mutate(group = rle_group(A)) %>%
  group_by(group) %>%
  mutate(C = sum(B))
#> # A tibble: 8 × 4
#> # Groups: group [4]
#> A B group C
#> <dbl> <dbl> <int> <dbl>
#> 1 0 1 1 2
#> 2 0 1 1 2
#> 3 1 10 2 15
#> 4 1 5 2 15
#> 5 0 2 3 5
#> 6 0 3 3 5
#> 7 1 20 4 50
#> 8 1 30 4 50
Created on 2022-04-27 by the reprex package (v2.0.0)
A dplyr solution without any rle functions:
library(dplyr)
df %>%
  group_by(grp = cumsum(c(TRUE, diff(A) != 0))) %>%
  mutate(C = sum(B)) %>%
  ungroup() %>%
  select(-grp)
# A tibble: 8 x 3
A B C
<int> <int> <int>
1 0 1 2
2 0 1 2
3 1 10 15
4 1 5 15
5 0 2 5
6 0 3 5
7 1 20 50
8 1 30 50
Its base equivalent:
within(df, C <- ave(B, cumsum(c(TRUE, diff(A) != 0)), FUN = sum))
A B C
1 0 1 2
2 0 1 2
3 1 10 15
4 1 5 15
5 0 2 5
6 0 3 5
7 1 20 50
8 1 30 50
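To see why cumsum(c(TRUE, diff(A) != 0)) reproduces rleid: diff(A) != 0 marks every position where A changes, and the cumulative sum turns those change points into increasing run ids. A quick check:
A <- c(0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L)
cumsum(c(TRUE, diff(A) != 0))
#> [1] 1 1 2 2 3 3 4 4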

By group relative order

I have a data set that looks like this
ID Week
1  3
1  5
1  5
1  8
1  11
1  16
2  2
2  2
2  3
2  3
2  9
Now, what I would like to do is add another column to the data frame that marks, for every ID, each week's relative position. More elaborately, I would like to mark the ID's earliest week (smallest number) as 1, the next week for that ID as 2, and so forth, with two observations of the same week getting the same number.
So, in the above example I should get:
ID Week Order
1  3    1
1  5    2
1  5    2
1  8    3
1  11   4
1  16   5
2  2    1
2  2    1
2  3    2
2  3    2
2  9    3
How could I achieve this?
Thank you very much!
A base R option using ave + match
transform(
  df,
  Order = ave(Week, ID, FUN = function(x) match(x, sort(unique(x))))
)
or ave + order (thanks to @IRTFM for the comment)
transform(
  df,
  Order = ave(Week, ID, FUN = order)
)
gives
ID Week Order
1 1 3 1
2 1 5 2
3 1 5 2
4 1 8 3
5 1 11 4
6 1 16 5
7 2 2 1
8 2 2 1
9 2 3 2
10 2 3 2
11 2 9 3
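One caveat (my observation, not part of the original answer): order assigns distinct positions to tied values, so the ave + order variant only matches the table above when weeks are unique within an ID. With ties, as in this data, the match idiom (or a dense rank) is the safe choice:
x <- c(3, 5, 5, 8, 11, 16)
order(x)
#> [1] 1 2 3 4 5 6
match(x, sort(unique(x)))
#> [1] 1 2 2 3 4 5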
A data.table option with frank
> setDT(df)[, Order := frank(Week, ties.method = "dense"), ID][]
ID Week Order
1: 1 3 1
2: 1 5 2
3: 1 5 2
4: 1 8 3
5: 1 11 4
6: 1 16 5
7: 2 2 1
8: 2 2 1
9: 2 3 2
10: 2 3 2
11: 2 9 3
Data
> dput(df)
structure(list(ID = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L),
  Week = c(3L, 5L, 5L, 8L, 11L, 16L, 2L, 2L, 3L, 3L, 9L)),
  class = "data.frame", row.names = c(NA, -11L))
You can use dense_rank in dplyr:
library(dplyr)
df %>% group_by(ID) %>% mutate(Order = dense_rank(Week)) %>% ungroup
# ID Week Order
# <int> <int> <int>
# 1 1 3 1
# 2 1 5 2
# 3 1 5 2
# 4 1 8 3
# 5 1 11 4
# 6 1 16 5
# 7 2 2 1
# 8 2 2 1
# 9 2 3 2
#10 2 3 2
#11 2 9 3
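dense_rank is the dplyr counterpart of the match(x, sort(unique(x))) idiom from the base answer; a quick equivalence check:
library(dplyr)
x <- c(3, 5, 5, 8, 11, 16)
identical(dense_rank(x), match(x, sort(unique(x))))
#> [1] TRUE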

Count number of new and lost friends between two data frames in R

I have two data frames of the same respondents, one from Time 1 and the next from Time 2. In each wave they nominated their friends, and I want to know:
1) how many friends are nominated in Time 2 but not in Time 1 (new friends)
2) how many friends are nominated in Time 1 but not in Time 2 (lost friends)
Sample data:
Time 1 DF
ID friend_1 friend_2 friend_3
1 4 12 7
2 8 6 7
3 9 NA NA
4 15 7 2
5 2 20 7
6 19 13 9
7 12 20 8
8 3 17 10
9 1 15 19
10 2 16 11
Time 2 DF
ID friend_1 friend_2 friend_3
1 4 12 3
2 8 6 14
3 9 NA NA
4 15 7 2
5 1 17 9
6 9 19 NA
7 NA NA NA
8 7 1 16
9 NA 10 12
10 7 11 9
So the desired DF would include these columns (EDIT: filled in below):
ID num_newfriends num_lostfriends
1 1 1
2 1 1
3 0 0
4 0 0
5 3 3
6 0 1
7 0 3
8 3 3
9 2 3
10 2 1
EDIT2:
I've tried doing an anti join:
df3 <- anti_join(df1, df2)
But this method doesn't account for friend IDs that appear in a different column in Time 2 (for example, respondent #6 nominates friends 9 and 19 in both T1 and T2, but in different columns each time).
Another option:
library(tidyverse)
left_join(
  gather(df1, key, x, -ID), # Time 1 friends in long format
  gather(df2, key, y, -ID), # Time 2 friends in long format
  by = c("ID", "key")
) %>%
  group_by(ID) %>%
  summarise(
    # new: named in Time 2 but absent from Time 1 (NAs dropped first)
    num_newfriends = sum(!y[!is.na(y)] %in% x[!is.na(x)]),
    # lost: named in Time 1 but absent from Time 2
    num_lostfriends = sum(!x[!is.na(x)] %in% y[!is.na(y)])
  )
Output:
# A tibble: 10 x 3
ID num_newfriends num_lostfriends
<int> <int> <int>
1 1 1 1
2 2 1 1
3 3 0 0
4 4 0 0
5 5 3 3
6 6 0 1
7 7 0 3
8 8 3 3
9 9 2 3
10 10 2 2
Simple comparisons would be an option
library(tidyverse)
na_sums_old <- rowSums(is.na(time1)) # missing nominations per row, Time 1
na_sums_new <- rowSums(is.na(time2)) # missing nominations per row, Time 2
# friends named in both waves by the same respondent
# (NA %in% NA is TRUE, so NA pairs are counted here too)
kept_friends <- map_dbl(seq(nrow(time1)), ~ sum(time1[.x, -1] %in% time2[.x, -1]))
# subtract the spurious NA-to-NA matches counted above
kept_friends <- kept_friends - na_sums_old * (na_sums_new >= 1)
new_friends <- 3 - na_sums_new - kept_friends
lost_friends <- 3 - na_sums_old - kept_friends
tibble(ID = time1$ID, new_friends = new_friends, lost_friends = lost_friends)
# A tibble: 10 x 3
ID new_friends lost_friends
<int> <dbl> <dbl>
1 1 1 1
2 2 1 1
3 3 0 0
4 4 0 0
5 5 3 3
6 6 0 1
7 7 0 3
8 8 3 3
9 9 2 3
10 10 2 2
You can make anti_join work by first pivoting to a "long" data frame.
df1 <- df1 %>%
  pivot_longer(starts_with("friend_"), values_to = "friend") %>%
  drop_na()
df2 <- df2 %>%
  pivot_longer(starts_with("friend_"), values_to = "friend") %>%
  drop_na()
head(df1)
#> # A tibble: 6 x 3
#> ID name friend
#> <int> <chr> <int>
#> 1 1 friend_1 4
#> 2 1 friend_2 12
#> 3 1 friend_3 7
#> 4 2 friend_1 8
#> 5 2 friend_2 6
#> 6 2 friend_3 7
lost_friends <- anti_join(df1, df2, by = c("ID", "friend"))
new_friends <- anti_join(df2, df1, by = c("ID", "friend"))
respondents <- distinct(df1, ID)
respondents %>%
  full_join(
    count(lost_friends, ID, name = "num_lost_friends")
  ) %>%
  full_join(
    count(new_friends, ID, name = "num_new_friends")
  ) %>%
  mutate_at(vars(starts_with("num_")), replace_na, 0)
#> Joining, by = "ID"
#> Joining, by = "ID"
#> # A tibble: 10 x 3
#> ID num_lost_friends num_new_friends
#> <int> <dbl> <dbl>
#> 1 1 1 1
#> 2 2 1 1
#> 3 3 0 0
#> 4 4 0 0
#> 5 5 3 3
#> 6 6 1 0
#> 7 7 3 0
#> 8 8 3 3
#> 9 9 3 2
#> 10 10 2 2
Created on 2019-11-01 by the reprex package (v0.3.0)

Order values within column according to values within different column by group in R

I have the following panel data set:
group i f r d
1 4 8 3 3
1 9 4 5 1
1 2 2 2 2
2 5 5 3 2
2 3 9 3 3
2 9 1 3 1
I want to reorder column i in this data frame according to the values in column d within each group, so that the highest value of i in a group corresponds to the highest value of d. In the end my data.frame should look like this:
group i f r d
1 9 8 3 3
1 2 4 5 1
1 4 2 2 2
2 5 5 3 2
2 9 9 3 3
2 3 1 3 1
Here is a dplyr solution.
First, group by group. Then compute the permutation that sorts column d in a temporary column, ord, and use it to reorder i.
library(dplyr)
df1 %>%
  group_by(group) %>%
  mutate(ord = order(d),
         i = i[ord]) %>%
  ungroup() %>%
  select(-ord)
## A tibble: 6 x 5
# group i f r d
# <int> <int> <int> <int> <int>
#1 1 9 8 3 3
#2 1 2 4 5 1
#3 1 4 2 2 2
#4 2 9 5 3 2
#5 2 5 9 3 3
#6 2 3 1 3 1
original (wrong)
You can achieve this using dplyr and rank:
library(dplyr)
df1 %>%
  group_by(group) %>%
  mutate(i = i[rev(rank(d))])
Edit
This question is actually trickier than it first seems, and the original answer I posted is incorrect. The correct solution sorts i before subsetting by the rank of d. This gives the OP's desired output, which my previous answer did not (I wasn't paying attention!).
df1 %>%
  group_by(group) %>%
  mutate(i = i[order(i)][rank(d)])
# A tibble: 6 x 5
# Groups: group [2]
# group i f r d
# <int> <int> <int> <int> <int>
#1 1 9 8 3 3
#2 1 2 4 5 1
#3 1 4 2 2 2
#4 2 5 5 3 2
#5 2 9 9 3 3
#6 2 3 1 3 1
There is some confusion regarding the expected output, so here is a way to get both versions of it.
A base R option using split and mapply:
df$i <- c(mapply(function(x, y) sort(y)[x],
                 split(df$d, df$group), split(df$i, df$group)))
df
# group i f r d
#1 1 9 8 3 3
#2 1 2 4 5 1
#3 1 4 2 2 2
#4 2 5 5 3 2
#5 2 9 9 3 3
#6 2 3 1 3 1
Or another version
df$i <- c(mapply(function(x, y) y[order(x)],
                 split(df$d, df$group), split(df$i, df$group)))
df
# group i f r d
#1 1 9 8 3 3
#2 1 2 4 5 1
#3 1 4 2 2 2
#4 2 9 5 3 2
#5 2 5 9 3 3
#6 2 3 1 3 1
We can also use dplyr for this.
For the 1st version:
library(dplyr)
df %>%
  group_by(group) %>%
  mutate(i = sort(i)[d])
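A caveat on sort(i)[d] (my note, not from the original answer): indexing the sorted values by d directly only works because d happens to be a permutation of 1..n within each group. rank(d) gives the same result here and keeps working for arbitrary d values:
df %>%
  group_by(group) %>%
  mutate(i = sort(i)[rank(d, ties.method = "first")]) %>%
  ungroup()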
The 2nd version is already shown by @Rui using order:
df %>%
  group_by(group) %>%
  mutate(i = i[order(d)])
An option with data.table
library(data.table)
setDT(df1)[, i := i[order(d)], group]
df1
# group i f r d
#1: 1 9 8 3 3
#2: 1 2 4 5 1
#3: 1 4 2 2 2
#4: 2 9 5 3 2
#5: 2 5 9 3 3
#6: 2 3 1 3 1
If we need the second version
setDT(df1)[, i := sort(i)[d], group]
data
df1 <- structure(list(group = c(1L, 1L, 1L, 2L, 2L, 2L), i = c(4L, 9L, 2L, 5L, 3L, 9L),
  f = c(8L, 4L, 2L, 5L, 9L, 1L), r = c(3L, 5L, 2L, 3L, 3L, 3L),
  d = c(3L, 1L, 2L, 2L, 3L, 1L)), class = "data.frame", row.names = c(NA, -6L))

Finding values in consecutive rows

An example of the dataframe I have is given below.
ID X
1 1
2 2
3 1
4 0
5 0
6 1
7 4
8 5
9 6
10 7
11 0
12 0
I want to apply logic that checks whether 3 or more consecutive rows have a value > 0. If they do, I want to flag them in another column. Hence the output will look as follows.
ID X Y
1 1 1
2 2 1
3 1 1
4 0 0
5 0 0
6 1 1
7 4 1
8 5 1
9 6 1
10 7 1
11 0 0
12 0 0
EXTENSION -
How would I get the following output, giving a different Y value to each flagged group?
ID X Y
1 1 1
2 2 1
3 1 1
4 0 0
5 0 0
6 1 2
7 4 2
8 5 2
9 6 2
10 7 2
11 0 0
12 0 0
One option with base R: use rle to find runs of adjacent values in 'X' that are greater than 0, then replicate the flags based on the run lengths.
df1$Y <- with(rle(df1$X > 0), as.integer(rep(values & lengths > 2, lengths)))
df1$Y
#[1] 1 1 1 0 0 1 1 1 1 1 0 0
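It helps to see what rle returns for X > 0: values & lengths > 2 is then TRUE exactly for the positive runs of length 3 or more, and rep expands those flags back to row level:
rle(df1$X > 0)
#> Run Length Encoding
#>   lengths: int [1:4] 3 2 5 2
#>   values : logi [1:4] TRUE FALSE TRUE FALSE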
For the updated case in the OP's post
df1$Y <- inverse.rle(within.list(rle(df1$X > 0), {
  i1 <- values & (lengths > 2)
  values[i1] <- seq_along(values[i1])
}))
df1$Y
#[1] 1 1 1 0 0 2 2 2 2 2 0 0
Or, for the first (0/1) version, using rleid from data.table:
library(data.table)
setDT(df1)[, Y := as.integer((.N > 2) * (X > 0)), rleid(X > 0)]
data
df1 <- structure(list(ID = 1:12, X = c(1L, 2L, 1L, 0L, 0L, 1L, 4L, 5L, 6L, 7L, 0L, 0L)),
  class = "data.frame", row.names = c(NA, -12L))
We can use rleid from data.table to create run groups, use them in ave to get each group's length, and assign 1 to groups whose length is greater than or equal to 3.
library(data.table)
df$Y <- as.integer(ave(df$X, rleid(df$X > 0), FUN = length) >= 3)
df
# ID X Y
#1 1 1 1
#2 2 2 1
#3 3 1 1
#4 4 0 0
#5 5 0 0
#6 6 1 1
#7 7 4 1
#8 8 5 1
#9 9 6 1
#10 10 7 1
#11 11 0 0
#12 12 0 0
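One caveat worth noting (my observation, not in the original answer): the length test alone would also flag a run of three or more zeros. Adding an explicit X > 0 condition guards against that:
library(data.table)
# a run must be both long enough and positive to be flagged
df$Y <- as.integer(ave(df$X, rleid(df$X > 0), FUN = length) >= 3 & df$X > 0)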
EDIT
For the updated post, we can combine rleid from data.table with dplyr:
library(dplyr)
library(data.table)
df %>%
  group_by(group = rleid(X > 0)) %>%
  mutate(Y = ifelse(n() >= 3 & row_number() == 1, 1, 0)) %>%
  ungroup() %>%
  mutate(Y = cumsum(Y) * Y) %>%
  group_by(group) %>%
  mutate(Y = first(Y)) %>%
  ungroup() %>%
  select(-group)
# ID X Y
# <int> <int> <dbl>
# 1 1 1 1
# 2 2 2 1
# 3 3 1 1
# 4 4 0 0
# 5 5 0 0
# 6 6 1 2
# 7 7 4 2
# 8 8 5 2
# 9 9 6 2
#10 10 7 2
#11 11 0 0
#12 12 0 0
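A somewhat shorter variant of the same idea (a sketch, assuming dplyr and data.table are loaded as above), which also guards against long runs of zeros via the X check:
df %>%
  group_by(grp = rleid(X > 0)) %>%
  mutate(ok = X[1] > 0 & n() >= 3) %>% # does this run qualify?
  ungroup() %>%
  mutate(Y = if_else(ok, cumsum(ok & !duplicated(grp)), 0L)) %>%
  select(-grp, -ok)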
