Ifelse conditional on same strings in multiple columns - r

I guess this could be achieved with one very long line of code using mutate() and ifelse(), but I want to know if there is a way of doing it without writing a ton of code.
I have data where the degree of each individual is written in a non-ordered fashion. The data looks like this:
id <- c(1, 2, 3, 4, 5, 6)
degree1 <- c("masters", "bachelors", "PhD", "bachelors", "bachelors", NA)
degree2 <- c("PhD", "masters", "bachelors", NA, NA, NA)
degree3 <- c("bachelors", NA, "masters", NA, "masters", NA)
Now I want to create a new column containing the string for the highest degree, like this
dat$highest_degree <- c("PhD", "masters", "PhD", "bachelors", "masters", NA)
How can I achieve this?

An option is to loop over the rows of the selected 'degree' columns, convert each row to a factor with the levels specified in rank order, drop the unused levels, and select the first remaining level
# Degree labels ranked from highest to lowest.
v1 <- c("PhD", "masters", "bachelors")
# For each row, keep only the levels that actually occur and take the first one
# in ranking order; a row with none of the listed labels yields NA.
dat$highest_degree <- apply(dat[-1], 1, function(degrees) {
  observed <- droplevels(factor(degrees, levels = v1))
  levels(observed)[1]
})
dat$highest_degree
#[1] "PhD" "masters" "PhD" "bachelors" "masters" NA
Or using tidyverse, reshape into 'long' format, arrange the long-format column by matching it against the ordered degree vector, group by 'id' and slice the first row of each group, then join with the original data
library(dplyr)
library(tidyr)
# Reshape to one row per (id, degree), order each id's degrees by their position
# in `v1`, keep the first one, and attach it to the original wide data.
dat %>%
  pivot_longer(cols = starts_with("degree"), values_to = "highest_degree") %>%
  select(-name) %>%
  arrange(id, match(highest_degree, v1)) %>%
  group_by(id) %>%
  slice_head(n = 1) %>%
  ungroup() %>%
  # Name the join key explicitly rather than relying on the natural join over
  # whatever columns the two tables happen to share.
  left_join(dat, ., by = "id")
data
dat <- data.frame(id, degree1, degree2, degree3)

Here is a base R option using pmin + factor
# Degree labels ranked from highest to lowest.
lvs <- c("PhD", "masters", "bachelors")
dat$highest_degree <- local({
  # Integer position in `lvs` of every cell of dat[-1], one row per row of dat.
  ranks <- matrix(
    as.integer(factor(as.matrix(dat[-1]), levels = lvs)),
    nrow(dat)
  )
  # Parallel minimum across the columns, ignoring missing entries.
  best <- do.call(pmin, c(asplit(ranks, 2), na.rm = TRUE))
  lvs[best]
})
which gives
> dat
id degree1 degree2 degree3 highest_degree
1 1 masters PhD bachelors PhD
2 2 bachelors masters <NA> masters
3 3 PhD bachelors masters PhD
4 4 bachelors <NA> <NA> bachelors
5 5 bachelors <NA> masters masters
6 6 <NA> <NA> <NA> <NA>
Data
> dput(dat)
structure(list(id = c(1, 2, 3, 4, 5, 6), degree1 = c("masters",
"bachelors", "PhD", "bachelors", "bachelors", NA), degree2 = c("PhD",
"masters", "bachelors", NA, NA, NA), degree3 = c("bachelors",
NA, "masters", NA, "masters", NA)), class = "data.frame", row.names = c(NA,
-6L))

Related

Replace NA with text for specific ids

Based on the code and data below how can I replace NAs in column a with Transportation Element Maps 2012 based on specific ids in column id?
Code:
# Sample df
df = structure(list(id = c(1, 2, 3, 4, 5, 6, 7), a = c("a", "Transportation Element Maps 2012", NA, NA, NA, NA, NA)), class = "data.frame", row.names = c(NA,
-7L))
# Desired df
df1 = structure(list(id = c(1, 2, 3, 4, 5, 6, 7), a = c("a", "Transportation Element Maps 2012", "Transportation Element Maps 2012", "Transportation Element Maps 2012", "Transportation Element Maps 2012", "Transportation Element Maps 2012", NA)), class = "data.frame", row.names = c(NA,
-7L))
# Current approach which throws an error
df1 = df %>% ifelse(id %in% 3:6) %>% mutate(a %in% NA, "Transportation Element Maps 2012")
# Error
Error in ifelse(., reference_number %in% 3:6) :
'list' object cannot be coerced to type 'logical'
Use is.na, which returns a logical vector, to find the NA elements, and keep the id %in% 3:6 condition inside mutate:
library(dplyr)
df %>%
mutate(a = ifelse(id %in% 3:6 & is.na(a),
"Transportation Element Maps 2012", a))
You could also use the replace function, restricting the replacement to the rows with ids 3 to 6, as follows:
library(dplyr)
df |>
  # Only fill values that are missing AND belong to ids 3 to 6. Putting NA in
  # the id set (`c(3:6, NA)`) would test `id`, not `a`, and would overwrite
  # non-missing values of `a` for those ids.
  mutate(
    a = replace(a, id %in% 3:6 & is.na(a), "Transportation Element Maps 2012")
  )

apply custom-made function to column pairs and create summary table

I have data with ratings on many parameters by two different raters; shown here is just a snippet with ratings on three parameters, where the two raters' columns share a prefix (e.g. DH and DH_ptak):
df <- structure(list(DH = c(0, 1, NA, NA, 1, 1, 1, 1, 1, 1),
DH_ptak = c(0, 1, 1, 1, 1, 1, 1, 1, 1, 1),
SZ = c(1, 1, NA, NA, NA, 0, 1, 0, 1, 1),
SZ_ptak = c(1, 1, NA, NA, NA, 1, 0, NA, 1, 1),
RM = c(0, 1, 1, NA, NA, NA, 0, NA, 1, NA),
RM_ptak = c(0, 1, 1, 1, 1, NA, 0, 1, NA, 1)),
row.names = c(NA, 10L), class = "data.frame")
For each parameter I want to compare the two ratings columns. I use this function to find different ratings:
# Flag, element by element, whether two rating vectors disagree.
#
# c1, c2: the two rating vectors to compare.
# Returns 1 where exactly one rating is missing or the two ratings differ,
# and 0 where both are missing or both are equal.
compare_fun <- function(c1, c2) {
  missing1 <- is.na(c1)
  missing2 <- is.na(c2)
  case_when(
    missing1 & missing2 ~ 0,
    missing1 | missing2 | c1 != c2 ~ 1,
    TRUE ~ 0
  )
}
I can use this function to sum the differences and compute an agreement percentage agree_pct:
library(dplyr)
df %>%
mutate(diff = compare_fun(DH, DH_ptak)) %>%
summarise(sum = sum(diff),
agree_pct = (nrow(df)-sum)/nrow(df)*100)
sum agree_pct
1 2 80
The problem is that I have multiple parameters. How can I compute for all ratings-column pairs the respective sum and agree_pct in one go, ideally, to obtain a table like this:
sum agree_pct
DH 2 80
SZ 3 70
RM 5 50
This is what I would do. It mostly involves pivoting the data a few times. First I make a column from row names so that I can use this to keep all the rows straight, then I go from wide to long with pivot_longer. I separate the column names to delineate between the two reviewers and assign them the names "grp1" and "grp2". Then I pivot_wider so that you have 2 columns, one for each reviewer. Lastly I apply your function across all the data, group by the variable of interest and summarize the data.
library(tidyverse)
df %>%
rownames_to_column("col") %>%
pivot_longer( -col) %>%
separate(name, into = c("var", "tmp"), sep = "_") %>%
mutate(grp = ifelse(is.na(tmp), "grp1", "grp2")) %>%
select(col, var, value, grp) %>%
pivot_wider(names_from = grp, values_from = value) %>%
mutate(diff = compare_fun(grp1, grp2)) %>%
group_by(var) %>%
summarise(sum = sum(diff),
agree_pct = (nrow(df)-sum)/nrow(df)*100)
#> # A tibble: 3 x 3
#> var sum agree_pct
#> <chr> <dbl> <dbl>
#> 1 DH 2 80
#> 2 RM 5 50
#> 3 SZ 3 70

calculate the mean of column and also the comments in next column

I want to calculate the mean of a column and also concatenate the texts of a second column in the output.
For example, below I want to calculate the mean of C1 and then concatenate all texts in C1T in the next column if there is more than one text in C1T.
# Example data: each score column Cn is followed by its text column CnT.
df <- data.frame(
  A1 = c("class", "type", "class", "type", "class", "class", "class", "class", "class"),
  B1 = c("b2", "b3", "b3", "b1", "b3", "b3", "b3", "b2", "b1"),
  C1 = c(6, NA, 1, 6, NA, 1, 6, 6, 2),
  C1T = c(NA, "Part of other business", NA, NA, NA, NA, NA, NA, NA),
  C2 = c(NA, 4, 1, 2, 4, 4, 3, 3, NA),
  C2T = c(NA, NA, NA, NA, NA, NA, NA, NA, NA),
  C3 = c(3, 4, 3, 3, 6, NA, 2, 4, 1),
  C3T = c(NA, NA, NA, NA, "two part are available but not in source", NA, NA, NA, NA),
  C4 = c(5, 5, 2, NA, NA, 6, 4, 1, 2),
  # Text column for C4. It was labelled `C5T`, which duplicated the name of the
  # last column and left no `C4T` for code that refers to it.
  C4T = c(NA, NA, NA, NA, NA, NA, NA, "Critical Expert", NA),
  C5 = c(6, 2, 6, 4, 2, 2, 5, 4, 1),
  C5T = c(NA, NA, NA, NA, NA, "most of things are stuck", "weather responsible", NA, NA)
)
var <- "C1"
var1 <- "C1T"
var <- rlang::parse_expr(var)
var1 <- rlang::parse_expr(var1)
df1 <- df%>%filter(A1 == "class")
T1<- df1 %>%group_by(B1)%>%summarise(mean=round(mean(!!var,na.rm = TRUE),1))
Comments <- df1 %>% group_by(B1) %>% summarise_at(vars(var1), paste0, collapse = " ") %>%
select(var1) %>% unlist() %>% gsub("NA","",.) %>% stringi::stri_trim_both()
cbind(T1,Comments)
Edited Answer:
var <- "C1"
var1 <- "C1T"
filtercol <- "A1"
filterval <- "class"
groupingvar <- "B1"
var <- rlang::parse_expr(var)
var1 <- rlang::parse_expr(var1)
filtercol <- rlang::parse_expr(filtercol)
groupingvar <- rlang::parse_expr(groupingvar)
library(dplyr)
df1 <- df %>% filter(!!filtercol == filterval)
T1 <- df1 %>% group_by(!!groupingvar) %>% summarise(mean=round(mean(as.numeric(!!var),na.rm = TRUE),1))
Comments <- df1 %>% select(!!groupingvar, !!var1) %>%
group_by(!!groupingvar) %>%
summarise_at(vars(!!var1), paste0, collapse = " ") %>%
select(!!var1) %>% unlist() %>% gsub("NA", "", .) %>%
stringi::stri_trim_both()
T1 <- cbind(T1,Comments)
Update on OP's request (see comments):
library(dplyr)
# helper function to coalesce by column
# Return the first non-missing value of a column.
#
# df: despite its name, a single column passed as a vector (it is called with
#   `values` inside summarise()).
# Returns a length-one vector holding the first non-NA element, or NA when
# every element is missing. Coalescing only the first two elements would lose
# a value that sits at position three or later.
coalesce_by_column <- function(df) {
  # Indexing an empty selection with [1] yields NA of the vector's own type.
  df[!is.na(df)][1]
}
df %>%
pivot_longer(
cols = contains("T"),
names_to = "names",
values_to = "values"
) %>%
filter(names == "C1T") %>%
group_by(names) %>%
summarise(Mean = mean(c_across(C1:C5 & where(is.numeric)), na.rm = TRUE),
Comments = coalesce_by_column(values))
Output:
names Mean Comments
<chr> <dbl> <chr>
1 C1T 3.47 Part of other business
First answer
coalesce to construct Comments column
rowwise with c_across to calculate the mean rowwise.
In case you need to group, you can use `group_by`
library(dplyr)
df %>%
mutate(Comments = coalesce(C1T, C2T, C3T, C4T, C5T),.keep="unused") %>%
rowwise() %>%
mutate(Mean = mean(c_across(C1:C5 & where(is.numeric)), na.rm = TRUE)) %>%
select(A1, B1, Mean, Comments)
Output:
A1 B1 Mean Comments
<chr> <chr> <dbl> <chr>
1 class b2 5 NA
2 type b3 3.75 Part of other business
3 class b3 2.6 NA
4 type b1 3.75 NA
5 class b3 4 two part are available but not in source
6 class b3 3.25 most of things are stuck
7 class b3 4 weather responsible
8 class b2 3.6 Critical Expert
9 class b1 1.5 NA

r: create data frame with all possible options and number of variable combinations

This question might be obvious or asked already, but I can't find a solution:
I want to create a data frame with all possible combinations (and number of variables) such that it looks like the following example:
dataframe <- data.frame(variable = 1:4,
a = c("gender", NA, NA, NA),
b = c("age", NA, NA, NA),
c = c("city", NA, NA, NA),
d = c("education", NA, NA, NA),
e = c("gender", "age", NA, NA),
f = c("gender", "city", NA, NA),
g = c("gender", "education", NA, NA),
h = c("age", "city", NA, NA),
i = c("age", "education", NA, NA),
j = c("city", "education", NA, NA),
k = c("gender", "age", "city", NA),
l = c("gender", "age", "education", NA),
m = c("gender", "city", "education", NA),
n = c("gender", "age", "city", "education"))
I have too many variables, so it's not worth writing it out, and I want to avoid errors. Thank you for helping!
Here is an option with combn. Get the vector of variable names, loop through the sequence along the vector, apply combn on the vector with m set to the current value of the sequence, convert to data.frame, and cbind all the list elements together. The cbind.fill from rowr is suitable for filling with NA those list elements that have fewer rows than the data.frame with the most rows
library(rowr)
res <- do.call(cbind.fill, c(fill = NA, lapply(seq_along(v1), function(i) {
m1 <- combn(v1, i)
if(is.vector(m1)) as.data.frame.list(m1) else as.data.frame(m1)})))
colnames(res) <- letters[seq_along(res)]
Or, as @Moody_Mudskipper suggested,
res1 <- do.call(cbind.fill, c(fill = NA, lapply(seq_along(v1), function(i) combn(v1, i))))
colnames(res1) <- letters[seq_len(ncol(res1))]
data
v1 <- c('gender', 'age', 'city', 'education')

Multiple columns processing and dynamically naming new columns

Variables are mistakenly being entered into multiple columns (e.g. "aaa_1", "aaa_2" and "aaa_3", or "ccc_1", "ccc_2" and "ccc_3"). I need to create single new columns (e.g. "aaa" or "ccc"). Some variables are currently in a single column ("hhh_1"), but more columns may be added later (hhh_2 etc.).
This is what I got:
aaa_1 <- c(43, 23, 65, NA, 45)
aaa_2 <- c(NA, NA, NA, NA, NA)
aaa_3 <- c(NA, NA, 92, NA, 82)
ccc_1 <- c("fra", NA, "spa", NA, NA)
ccc_2 <- c(NA, NA, NA, "wez", NA)
ccc_3 <- c(NA, "ija", NA, "fda", NA)
ccc_4 <- c(NA, NA, NA, NA, NA)
hhh_1 <- c(183, NA, 198, NA, 182)
dataf1 <- data.frame(aaa_1,aaa_2,aaa_3,ccc_1,ccc_2, ccc_3,ccc_4,hhh_1)
This is what I want:
aaa <- c(43, 23, NA, NA, NA)
ccc <- c("fra", "ija", "spa", NA, NA)
hhh <- c(183, NA, 198, NA, 182)
dataf2 <- data.frame(aaa,ccc,hhh)
General solution needed as there are ~100 variables (eg "aaa", "hhh", "ccc", "ttt", "eee", "hhh"etc).
Thanks!
This is a base solution, i.e. no packages.
First define get_only which when given a list converts it to a data.frame and applies get_only to each row. When given a vector it returns the single non-NA in it or NA if there is not only one.
Define root to be the column names without the suffixes.
Convert the data frame to a list of columns, group them by root and apply get_only to each such group.
Finally, convert the resulting list to a data frame.
# Generic: collapse a group of alternative columns (or one row of them) into
# the single value they hold.
get_only <- function(x) UseMethod("get_only")

# List method: bind the list's elements into a data frame and collapse it row
# by row.
get_only.list <- function(x) {
  columns <- data.frame(x)
  apply(columns, 1, get_only)
}

# Default method: return the lone non-NA element of `x`, or NA when `x` holds
# zero or more than one non-NA element.
get_only.default <- function(x) {
  n_present <- sum(!is.na(x))
  if (n_present == 1) {
    na.omit(x)
  } else {
    NA
  }
}
# Strip everything from the first underscore on, leaving each column's root name.
root <- sub("_.*", "", names(dataf1))
# Group the columns by root, collapse each group with get_only, and reassemble
# the results into a data frame with one column per root.
as.data.frame(lapply(split(as.list(dataf1), root), FUN = get_only))
giving:
age country hight
1 43 fra 183
2 23 ija NA
3 NA spa 198
4 NA <NA> NA
5 NA <NA> 182
We may try with splitstackshape
library(splitstackshape)
nm1 <- sub("_\\d+", "", names(dataf1))
tbl <- table(nm1) > 1
merged.stack(dataf1, var.stubs = names(tbl)[tbl], sep="_")
I'm not sure your example is right. For example in the third row you've got values for both age_1 and age_3, then in the desired output NA for that row.
If I've understood what you're trying to do though, it will be much easier if you transpose columns to rows, fix them and then transpose back again. Try this as a start point using the 'tidyverse' of dplyr and tidyr.
library(tidyverse)
library(stringr)
age_1 <- c(43, 23, 65, NA, 45)
age_2 <- c(NA, NA, NA, NA, NA)
age_3 <- c(NA, NA, 92, NA, 82)
country_1 <- c("fra", NA, "spa", NA, NA)
country_2 <- c(NA, NA, NA, "wez", NA)
country_3 <- c(NA, "ija", NA, "fda", NA)
country_4 <- c(NA, NA, NA, NA, NA)
hight_1 <- c(183, NA, 198, NA, 182)
dataf1 <- data.frame(age_1,age_2,age_3,country_1,country_2, country_3,country_4,hight_1)
data <- dataf1 %>%
mutate(row_num = row_number()) %>% #create a row number to track values
gather(key, value, -row_num) %>% #flatten your data
drop_na() %>% #drop na rows
mutate(key = str_replace(key, "_.", "")) %>% #remove the '_x' part of names
group_by(row_num) %>%
top_n(1) %>%
spread(key, value) #pivot back to columns
For your example you need the group_by() and top_n() lines to make it run because you've got multiple values in the same row. If you only have one value (as I think you should?) then you can remove these two lines. It will be better without them because then it won't run if your data is wrong.
Edit following comment below. This will make any duplicated entries NA.
data <- dataf1 %>%
mutate(row_num = row_number()) %>% #create a row number to track values
gather(key, value, -row_num) %>% #flatten your data
drop_na() %>% #drop na rows
mutate(key = str_replace(key, "_.", "")) %>% #remove the '_x' part of names
group_by(row_num, key) %>%
mutate(count = n()) %>% #count how many entries for each row/key combo
mutate(value = ifelse(count > 1, NA, value)) %>% #set NA for rows with duplicates
drop_na() %>%
spread(key, value) %>% #pivot back to columns
select(-count) #drop the `count` variable

Resources