How to change column values based on duplication in another column in R

My data looks like this:
data <- data.frame(grupoaih = c("09081997", "13122006", "09081997", "22031969"),
                   NMM_PROC_BR = c(1, 1, 0, 1),
                   NMM_CID = c(0, 1, 1, 0),
                   CPAV_PROC_BR = c(0, 0, 0, 1),
                   CPAV_CID = c(1, 1, 0, 1))
grupoaih NMM_PROC_BR NMM_CID CPAV_PROC_BR CPAV_CID
1 09081997 1 0 0 1
2 13122006 1 1 0 1
3 09081997 0 1 0 0
4 22031969 1 0 1 1
How can I assign the value 1 when "grupoaih" is duplicated, so that the other 4 variables get filled in equally, like this:
data2 <- data.frame(grupoaih = c("09081997", "13122006", "09081997", "22031969"),
                    NMM_PROC_BR = c(1, 1, 1, 1),
                    NMM_CID = c(1, 1, 1, 0),
                    CPAV_PROC_BR = c(0, 0, 0, 1),
                    CPAV_CID = c(1, 1, 1, 1))
grupoaih NMM_PROC_BR NMM_CID CPAV_PROC_BR CPAV_CID
1 09081997 1 1 0 1
2 13122006 1 1 0 1
3 09081997 1 1 0 1
4 22031969 1 0 1 1
This only applies if grupoaih is duplicated and at least one of the duplicated rows has a 1 in a variable. If all of the duplicated rows have 0 in a variable, that variable stays 0 (as CPAV_PROC_BR does above).

You can use group_by() and then n() to check whether there are duplicates, and fill each column with its group-wise maximum when they exist (so a column that is 0 in every duplicated row stays 0). In the across() call, . stands for the original value and ~ indicates a formula.
library(dplyr)
data %>%
  group_by(grupoaih) %>%
  mutate(across(c("NMM_PROC_BR", "NMM_CID", "CPAV_PROC_BR", "CPAV_CID"),
                ~ ifelse(n() > 1, max(.), .))) %>%
  ungroup()
# # A tibble: 4 × 5
# grupoaih NMM_PROC_BR NMM_CID CPAV_PROC_BR CPAV_CID
# <chr> <dbl> <dbl> <dbl> <dbl>
# 1 09081997 1 1 0 1
# 2 13122006 1 1 0 1
# 3 09081997 1 1 0 1
# 4 22031969 1 0 1 1

Another option is to take the column-wise max after grouping:
library(dplyr)
data %>%
  group_by(grupoaih) %>%
  mutate(across(everything(), max)) %>%
  ungroup()
Output:
# A tibble: 4 × 5
grupoaih NMM_PROC_BR NMM_CID CPAV_PROC_BR CPAV_CID
<chr> <dbl> <dbl> <dbl> <dbl>
1 09081997 1 1 0 1
2 13122006 1 1 0 1
3 09081997 1 1 0 1
4 22031969 1 0 1 1
Or use fmax from the collapse package:
library(collapse)
# TRA = 1 broadcasts the group maxima back onto the original rows
data[-1] <- fmax(data[-1], data$grupoaih, TRA = 1)
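A base R sketch of the same group-wise maximum idea (my addition, not part of the answers above) would use ave():
# base R: ave() applies max within each grupoaih group and returns a full-length vector
data[-1] <- lapply(data[-1], function(x) ave(x, data$grupoaih, FUN = max))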

Related

How to track changes in rows/lines of data frame?

Suppose I have the following dataframe
df:
df = data.frame(ID = c(123100, 123200, 123300, 123400, 123500),
                "2014" = c(1, 1, 1, 2, 3), "2015" = c(1, 1, 1, 2, 3),
                "2016" = c(2, 1, 1, 2, 1), "2017" = c(2, 1, 1, 2, 1),
                "2018" = c(2, 3, 1, 2, 1))
Now, I want to find out, for which ID the data has changed in which year. So for example, in 2016 ID 123100 has changed from 1 to 2. I would like to add new columns for change (1 = change, 0 = no change), year of change, old value (1,2 or 3) and new value (1,2,3).
In the end it should look like this:
df_final = data.frame(ID = c(123100, 123200, 123300, 123400, 123500),
                      "2014" = c(1, 1, 1, 2, 3), "2015" = c(1, 1, 1, 2, 3),
                      "2016" = c(2, 1, 1, 2, 1), "2017" = c(2, 1, 1, 2, 1),
                      "2018" = c(2, 3, 1, 2, 1),
                      "change" = c(1, 1, 0, 0, 1), "year" = c(2016, 2018, 0, 0, 2016),
                      "before" = c(1, 1, 0, 0, 3), "after" = c(2, 3, 0, 0, 1))
I couldn't find any satisfying solution on here, so I hope you can help me.
Here's a base R method.
It may be best to leave the IDs with no change as NA. If you really want zeroes, just change c(NA, NA, NA) in the following code to c(0, 0, 0).
Note that in your example data frames, if you run the code as-is, the column names for each year all start with an "X" - you can prevent this by adding the check.names = FALSE argument to the data.frame() call.
cbind(df, setNames(as.data.frame(t(apply(df[-1], 1, function(x) {
  y <- which(diff(x) != 0)
  if (length(y)) c(as.numeric(names(y)), x[y], x[y + 1])
  else c(NA, NA, NA)
}))), c("Year", "Before", "After")))
#> ID 2014 2015 2016 2017 2018 Year Before After
#> 1 123100 1 1 2 2 2 2016 1 2
#> 2 123200 1 1 1 1 3 2018 1 3
#> 3 123300 1 1 1 1 1 NA NA NA
#> 4 123400 2 2 2 2 2 NA NA NA
#> 5 123500 3 3 1 1 1 2016 3 1
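If the 0/1 change indicator asked for in the question is also wanted, it can be derived from the Year column afterwards; a small sketch, assuming the cbind() result above has been stored in a (hypothetical) object named res:
res$change <- as.integer(!is.na(res$Year))  # 1 where a change was found, 0 otherwise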
Data used
df <- structure(list(ID = c(123100, 123200, 123300, 123400, 123500),
`2014` = c(1, 1, 1, 2, 3), `2015` = c(1, 1, 1, 2, 3), `2016` = c(2,
1, 1, 2, 1), `2017` = c(2, 1, 1, 2, 1), `2018` = c(2, 3,
1, 2, 1)), class = "data.frame", row.names = c(NA, -5L))
Created on 2022-06-18 by the reprex package (v2.0.1)
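As a side note on the check.names remark above, the X prefix can be avoided when building the example data, for instance (a sketch of the OP's call with check.names = FALSE added):
df <- data.frame(ID = c(123100, 123200, 123300, 123400, 123500),
                 "2014" = c(1, 1, 1, 2, 3), "2015" = c(1, 1, 1, 2, 3),
                 "2016" = c(2, 1, 1, 2, 1), "2017" = c(2, 1, 1, 2, 1),
                 "2018" = c(2, 3, 1, 2, 1), check.names = FALSE)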
Here is an alternative tidyverse approach:
library(tidyverse)
# join the summary back to the original df
dplyr::left_join(df,
                 # make df long to build groupings by ID
                 tidyr::pivot_longer(df, -ID) %>%
                   dplyr::group_by(ID) %>%
                   # order just to be sure
                   dplyr::arrange(ID, name) %>%
                   # generate year number, before and after values
                   dplyr::mutate(year = readr::parse_number(name),
                                 before = lag(value),
                                 # if there is no following value, use the current one
                                 after = ifelse(is.na(lead(value)), value, lead(value))) %>%
                   # keep rows where the preceding value differs from the current one
                   dplyr::filter(before != value) %>%
                   # drop obsolete columns
                   dplyr::select(-name, -value),
                 by = "ID") %>%
  # fill empty fields with zeros
  dplyr::mutate(dplyr::across(year:after, ~ ifelse(is.na(.x), 0, .x)))
ID X2014 X2015 X2016 X2017 X2018 year before after
1 123100 1 1 2 2 2 2016 1 2
2 123200 1 1 1 1 3 2018 1 3
3 123300 1 1 1 1 1 0 0 0
4 123400 2 2 2 2 2 0 0 0
5 123500 3 3 1 1 1 2016 3 1
another possibility to solve the task within the tidyverse is to work row-wise:
dplyr::rowwise(df) %>%
dplyr::mutate(year = readr::parse_number(names(.)[stringr::str_detect(names(.), pattern = "^X")][c(0, diff(dplyr::c_across(dplyr::starts_with("x")))) != 0][1]),
before = dplyr::c_across(dplyr::starts_with("x"))[dplyr::lead(c(0, diff(dplyr::c_across(dplyr::starts_with("x")))) != 0, default = FALSE)][1],
after = dplyr::coalesce(dplyr::c_across(dplyr::starts_with("x"))[dplyr::lag(c(0, diff(dplyr::c_across(dplyr::starts_with("x")))) != 0, default = FALSE)][1],
dplyr::c_across(dplyr::starts_with("x"))[c(0, diff(dplyr::c_across(dplyr::starts_with("x")))) != 0][1])) %>%
dplyr::ungroup() %>%
dplyr::mutate(dplyr::across(year:after, ~ ifelse(is.na(.x), 0, .x)))
ID X2014 X2015 X2016 X2017 X2018 year before after
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 123100 1 1 2 2 2 2016 1 2
2 123200 1 1 1 1 3 2018 1 3
3 123300 1 1 1 1 1 0 0 0
4 123400 2 2 2 2 2 0 0 0
5 123500 3 3 1 1 1 2016 3 1
matrixStats::rowDiffs might be helpful and faster here.
z <- apply(matrixStats::rowDiffs(as.matrix(df[-1])) != 0, 1, which.max) + 1
d <- dim(df)
m <- matrix(t(df[-1])[c(z + 0:(d[2] - 2) * d[1] - 1, z + 0:(d[2] - 2) * d[1])],
            ncol = 2, dimnames = list(c(), c('before', 'after')))
cbind(df, `[<-`(cbind(change = 1, year = substring(names(df[-1])[z], 2), m), z == 2, 1:4, 0))
# ID X2014 X2015 X2016 X2017 X2018 change year before after
# 1 123100 1 1 2 2 2 1 2016 1 2
# 2 123200 1 1 1 1 3 1 2018 1 3
# 3 123300 1 1 1 1 1 0 0 0 0
# 4 123400 2 2 2 2 2 0 0 0 0
# 5 123500 3 3 1 1 1 1 2016 3 1
Data:
df <- structure(list(ID = c(123100, 123200, 123300, 123400, 123500),
X2014 = c(1, 1, 1, 2, 3), X2015 = c(1, 1, 1, 2, 3), X2016 = c(2,
1, 1, 2, 1), X2017 = c(2, 1, 1, 2, 1), X2018 = c(2, 3, 1,
2, 1)), class = "data.frame", row.names = c(NA, -5L))

How to select rows with exactly 2 unique values with the tidyverse?

What I have:
library(magrittr)
set.seed(1234)
what_i_have <- tibble::tibble(
  A = c(0, 1) |> sample(5, replace = TRUE),
  B = c(0, 1) |> sample(5, replace = TRUE),
  C = c(0, 1) |> sample(5, replace = TRUE)
)
It looks like this:
> what_i_have
# A tibble: 5 x 3
A B C
<dbl> <dbl> <dbl>
1 1 1 1
2 1 0 1
3 1 0 1
4 1 0 0
5 0 1 1
What I want:
what_i_want <- what_i_have %>% .[apply(., 1, function(row) row |> unique() |> length() == 2),]
It looks like this:
# A tibble: 4 x 3
A B C
<dbl> <dbl> <dbl>
1 1 0 1
2 1 0 1
3 1 0 0
4 0 1 1
My question is: is there a tidyverse way to do the things above?
I tried this:
what_i_have |>
dplyr::rowwise() |>
dplyr::filter_all(function(row) row |> unique() |> length() == 2)
but it returns the following empty tibble and I do not know why
# A tibble: 0 x 3
# Rowwise:
# … with 3 variables: A <dbl>, B <dbl>, C <dbl>
Thank you.
Here is one option with tidyverse. Here, I treat each row as a vector (via c_across), then get the number of distinct values using n_distinct and return TRUE for the rows that have 2 unique values.
library(tidyverse)
what_i_have %>%
  rowwise() %>%
  filter(n_distinct(c_across(everything())) == 2)
Output
A B C
<dbl> <dbl> <dbl>
1 0 1 1
2 1 0 1
3 1 0 0
4 1 1 0
A mixed method approach with apply could be:
what_i_have %>%
  filter(apply(., 1, \(x) length(unique(x))) == 2)
Data
what_i_have <-
structure(
list(
A = c(0, 1, 1, 1, 1),
B = c(1, 0, 0, 1, 1),
C = c(1, 1, 0, 1, 0)
),
class = c("tbl_df", "tbl", "data.frame"),
row.names = c(NA,-5L)
)
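Since the columns here only ever hold 0 or 1, a fully vectorised sketch (my addition, not taken from the answers above) is to keep the rows that contain at least one 0 and at least one 1:
library(dplyr)
# a row of 0/1 values has exactly 2 distinct values iff it contains both a 0 and a 1
what_i_have %>%
  filter(if_any(everything(), ~ .x == 0) & if_any(everything(), ~ .x == 1))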

Create new variables by combining mutate and case_when in R

I have a dataset with app interactions for different users. App interactions are saved as the number of days a feature has been interacted with by a user in a specific week. A sample table looks like this (id and weeknr are commented out for brevity):
tibble(
  # id = c(1, 1, 1),
  # weeknr = c(1, 2, 3),
  var_1 = c(1, 2, 3, 2, 1),
  var_2 = c(0, 0, 1, 4, 0),
  var_3 = c(1, 1, 1, 0, 0)
)
The goal is now to create three new columns based on var_1 to var_3: if the interaction count is greater than 0, assign 1, otherwise 0. I tried the following with no success:
tibble(
  var_1 = c(1, 2, 3, 2, 1),
  var_2 = c(0, 0, 1, 4, 0),
  var_3 = c(1, 1, 1, 0, 0)
) %>%
  mutate_all(
    funs(case_when(
      . > 0 ~ 1,
      . == 0 ~ 0,
      TRUE ~ NA
    ))
  )
Any help is much appreciated!
Making use of dplyr::across you could do the following. Note the typed NA_real_: with a plain NA (which is logical), case_when throws a type error, which is likely why the original attempt failed (funs() is also deprecated in favour of across()).
library(dplyr)
tibble(
  var_1 = c(1, 2, 3, 2, 1),
  var_2 = c(0, 0, 1, 4, 0),
  var_3 = c(1, 1, 1, 0, 0)
) %>%
  mutate(across(everything(),
                ~ case_when(
                  .x > 0 ~ 1,
                  .x == 0 ~ 0,
                  TRUE ~ NA_real_
                ),
                .names = "{.col}_new"))
#> # A tibble: 5 × 6
#> var_1 var_2 var_3 var_1_new var_2_new var_3_new
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 1 0 1 1 0 1
#> 2 2 0 1 1 0 1
#> 3 3 1 1 1 1 1
#> 4 2 4 0 1 1 0
#> 5 1 0 0 1 0 0
As my first choice is already given by stefan, here is an alternative way using bind_cols (generally rarely used) and ifelse:
library(dplyr)
df %>%
  mutate(across(where(is.numeric), ~ ifelse(. > 0, 1, 0))) %>%
  bind_cols(df)
Output:
var_1...1 var_2...2 var_3...3 var_1...4 var_2...5 var_3...6
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 0 1 1 0 1
2 1 0 1 2 0 1
3 1 1 1 3 1 1
4 1 1 0 2 4 0
5 1 0 0 1 0 0
Using as.integer or +
library(dplyr)
df %>%
  mutate(across(where(is.numeric), ~ +(. > 0), .names = "{.col}_new"))
Output:
# A tibble: 5 x 6
var_1 var_2 var_3 var_1_new var_2_new var_3_new
<dbl> <dbl> <dbl> <int> <int> <int>
1 1 0 1 1 0 1
2 2 0 1 1 0 1
3 3 1 1 1 1 1
4 2 4 0 1 1 0
5 1 0 0 1 0 0
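The as.integer variant mentioned above would look like this (a sketch on the same df):
library(dplyr)
df %>%
  mutate(across(where(is.numeric), ~ as.integer(. > 0), .names = "{.col}_new"))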

Assigning values to a column based on values of another column in the same dataframe in R

I have a dataframe with 3 columns and I want to assign values to a fourth column of this dataframe when a condition is met across the other columns. In this example I want to assign 1 to df[,4] for each row where the time columns sum to at least 2.
An example of what I want as the output was provided as an image (not reproduced here).
Any help is appreciated.
Thank you,
library(tidyverse)
data <- tribble(
  ~ID, ~time1, ~time2,
  'jkjkdf', 1, 1,
  'kjkj', 1, 0,
  'fgf', 1, 1,
  'jhkj', 0, 1,
  'hgd', 0, 0
)
mutate(data, label = if_else(time1 + time2 >= 2, 1, 0))
#> # A tibble: 5 x 4
#> ID time1 time2 label
#> <chr> <dbl> <dbl> <dbl>
#> 1 jkjkdf 1 1 1
#> 2 kjkj 1 0 0
#> 3 fgf 1 1 1
#> 4 jhkj 0 1 0
#> 5 hgd 0 0 0
# or with n time columns
data %>%
  rowwise() %>%
  mutate(label = if_else(sum(across(starts_with('time'))) >= 2, 1, 0))
#> # A tibble: 5 x 4
#> # Rowwise:
#> ID time1 time2 label
#> <chr> <dbl> <dbl> <dbl>
#> 1 jkjkdf 1 1 1
#> 2 kjkj 1 0 0
#> 3 fgf 1 1 1
#> 4 jhkj 0 1 0
#> 5 hgd 0 0 0
Created on 2021-06-06 by the reprex package (v2.0.0)
Do you want to assign 1 if both time1 and time2 are 1 ?
If there are only two columns you can do -
df$label <- as.integer(df$time1 == 1 & df$time2 == 1)
If there are many such time columns we can take help of rowSums -
cols <- grep('time', names(df))
df$label <- as.integer(rowSums(df[cols] == 1) == length(cols))
df
# a time1 time2 label
#1 a 1 1 1
#2 b 1 0 0
#3 c 1 1 1
#4 d 0 1 0
#5 e 0 0 0
data
Images are not the right way to share data, provide them in a reproducible format.
df <- data.frame(a = letters[1:5],
time1 = c(1, 1, 1, 0, 0),
time2 = c(1, 0, 1, 1, 0))
We could do this in a vectorized way using tidyverse methods: select the columns whose names start with 'time', reduce them to a single vector by adding (+) the corresponding elements, and use the aliases from magrittr to convert the result to binary for the 'label' column. Finally, the result should be assigned (<-) back to the original object if we want it to be changed.
library(dplyr)
library(purrr)
library(magrittr)
df %>%
  mutate(label = select(cur_data(), starts_with('time')) %>%
           reduce(`+`) %>%
           is_weakly_greater_than(2) %>%
           multiply_by(1))
a time1 time2 label
1 a 1 1 1
2 b 1 0 0
3 c 1 1 1
4 d 0 1 0
5 e 0 0 0
data
df <- structure(list(a = c("a", "b", "c", "d", "e"), time1 = c(1, 1,
1, 0, 0), time2 = c(1, 0, 1, 1, 0)), class = "data.frame", row.names = c(NA,
-5L))
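For comparison, the same row-wise test can be written without the magrittr aliases (a sketch, using the same df):
library(dplyr)
df %>%
  mutate(label = as.integer(rowSums(across(starts_with("time"))) >= 2))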

Create new columns based on comma-separated values in another column in R [duplicate]

This question already has answers here:
Convert column with pipe delimited data into dummy variables [duplicate]
(4 answers)
Closed 2 years ago.
I have some data similar to that below.
df <- data.frame(id = 1:5, tags = c("A,B,AB,C", "C", "AB,E", NA, "B,C"))
df
# id tags
# 1 1 A,B,AB,C
# 2 2 C
# 3 3 AB,E
# 4 4 <NA>
# 5 5 B,C
I'd like to create a new dummy variable column for each tag in the "tags" column, resulting in a dataframe like the following:
correct_df <- data.frame(id = 1:5,
tags = c("A,B,AB,C", "C", "AB,E", NA, "B,C"),
A = c(1, 0, 0, 0, 0),
B = c(1, 0, 0, 0, 1),
C = c(1, 1, 0, 0, 1),
E = c(0, 0, 1, 0, 0),
AB = c(1, 0, 1, 0, 0)
)
correct_df
# id tags A B C E AB
# 1 1 A,B,AB,C 1 1 1 0 1
# 2 2 C 0 0 1 0 0
# 3 3 AB,E 0 0 0 1 1
# 4 4 <NA> 0 0 0 0 0
# 5 5 B,C 0 1 1 0 0
One of the challenges is ensuring that the "A" column has 1 only for the "A" tag, so that it doesn't have 1 for the "AB" tag, for example. The following won't work for this reason, since "A" gets 1 for the "AB" tag:
df <- df %>%
mutate(A = ifelse(grepl("A", tags, fixed = T), 1, 0))
df
# id tags A
# 1 1 A,B,AB,C 1
# 2 2 C 0
# 3 3 AB,E 1 < Incorrect
# 4 4 <NA> 0
# 5 5 B,C 0
Another challenge is doing this programmatically. I can probably deal with a solution that manually creates a column for each tag, but a solution that doesn't assume which tag columns need to be created beforehand is best, since there can potentially be many different tags. Is there some relatively simple solution that I'm overlooking?
Does this work:
library(tidyr)
library(dplyr)
df %>%
  separate_rows(tags) %>%
  mutate(A = case_when(tags == 'A' ~ 1, TRUE ~ 0),
         B = case_when(tags == 'B' ~ 1, TRUE ~ 0),
         C = case_when(tags == 'C' ~ 1, TRUE ~ 0),
         E = case_when(tags == 'E' ~ 1, TRUE ~ 0),
         AB = case_when(tags == 'AB' ~ 1, TRUE ~ 0)) %>%
  group_by(id) %>%
  mutate(tags = toString(tags)) %>%
  group_by(id, tags) %>%
  summarise(across(A:AB, sum))
`summarise()` regrouping output by 'id' (override with `.groups` argument)
# A tibble: 5 x 7
# Groups: id [5]
id tags A B C E AB
<int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 A, B, AB, C 1 1 1 0 1
2 2 C 0 0 1 0 0
3 3 AB, E 0 0 0 1 1
4 4 NA 0 0 0 0 0
5 5 B, C 0 1 1 0 0
Here's a solution:
library(dplyr)
library(stringr)
library(magrittr)
library(tidyr)
#Data
df <- data.frame(id = 1:5, tags = c("A,B,AB,C", "C", "AB,E", NA, "B,C"))
#Separate into rows
df %<>% mutate(t2 = tags) %>% separate_rows(t2, sep = ",")
#Create a presence/absence column
df %<>% mutate(pa = 1)
#Pivot wider and use the presence/absence
#column as entries; fill with 0 if absent
df %<>% pivot_wider(names_from = t2, values_from = pa, values_fill = 0)
df
# # A tibble: 5 x 8
# id tags A B AB C E `NA`
# <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 A,B,AB,C 1 1 1 1 0 0
# 2 2 C 0 0 0 1 0 0
# 3 3 AB,E 0 0 1 0 1 0
# 4 4 NA 0 0 0 0 0 1
# 5 5 B,C 0 1 0 1 0 0
Edit: updated the code to enable it to retain the tags column. Sorry.
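Note that the NA tag of id 4 becomes its own column named "NA" in this output; if that column is unwanted, it could be dropped afterwards (a small sketch):
df <- df %>% select(-any_of("NA"))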
