Which field is different using dplyr? - r

I have the following data
df <- data.frame(val1=c(1.2,0.5,3.8,2.5,7.4),
val2=c(1.2,2.5,3.8,2.5,2.4),
val3=c(1.2,2.5,3.6,2.5,7.4),
val4=c(1.2,2.5,3.8,2.5,4.4),
val5=c(1.2,2.5,3.8,2.9,7.4))
I'd like to find which field is different to the rest. Expecting a result like below to be added to the data frame
cbind(df,results = c("all_equal", "val1","val3","val5","morethan1"))
Is there any way to do this in an easy way? I have an extensive loop to get this result which I won't post here. I'm looking for a quick solution that I've missed to see (maybe using dplyr)

First, define a function to calculate the mode. I used the function found here: https://stackoverflow.com/a/8189441/7669809
Modes <- function(x) {
ux <- unique(x)
tab <- tabulate(match(x, ux))
ux[tab == max(tab)]
}
After that, we can use the following code to get the desired output.
apply(df, 1, function(x){
x_mode <- Modes(x)
if (all(x == x_mode)){
return("all_equal")
} else if (sum(x != x_mode) > 1){
return("morethan1")
} else {
ind <- which(x != x_mode)
return(paste0("val", ind))
}
})
# [1] "all_equal" "val1" "val3" "val5" "morethan1"

Here is one option with tidyverse by reshaping the data into 'long' format, apply the conditions to create the 'result' column and then bind the column with the original dataset
library(tidyverse)
rownames_to_column(df, 'rn') %>%
gather(key, val, matches('^val')) %>%
group_by(rn) %>%
mutate(Mode = Modes(val)) %>%
summarise(result = case_when(all(val == Mode) ~ "all_equal",
sum(val != Mode) > 1 ~ "morethan1",
TRUE ~ paste0("val", which(val != Mode), collapse=","))) %>%
select(result) %>%
bind_cols(df, .)
# val1 val2 val3 val4 val5 result
#1 1.2 1.2 1.2 1.2 1.2 all_equal
#2 0.5 2.5 2.5 2.5 2.5 val1
#3 3.8 3.8 3.6 3.8 3.8 val3
#4 2.5 2.5 2.5 2.5 2.9 val5
#5 7.4 2.4 7.4 4.4 7.4 morethan1
The Modes function
Modes <- function(x) {
ux <- unique(x)
tab <- tabulate(match(x, ux))
ux[tab == max(tab)]
}

Related

Indexing problem while adding values defined by multiple other columns of tibbles within a list

I have a named list containing 4 tibbles:
list_all <- list("iris_a" = iris,
"iris_b" = iris,
"iris_c" = iris,
"iris_d" = iris)
Now I'd like to add for example the Petal.Length of the flower that has a Sepal.Length of 6.8 and Species == versicolor to the Petal.Length of the flower that has a Sepal.Length of 7.0 and Species == versicolor in every one of those four tibbles.
I can do this hardcoded with:
list_all[['iris_a']][51,3] <- list_all[['iris_a']][51,3] + list_all[['iris_a']][77,3]
list_all[['iris_b']][51,3] <- list_all[['iris_b']][51,3] + list_all[['iris_b']][77,3]
list_all[['iris_c']][51,3] <- list_all[['iris_c']][51,3] + list_all[['iris_c']][77,3]
list_all[['iris_d']][51,3] <- list_all[['iris_d']][51,3] + list_all[['iris_d']][77,3]
but trying to grab the value with something like
list_all[['iris_a']]['Sepal.Length' == 7.0 & 'Species' == 'versicolor', 'Petal.Width']
results in numeric(0).
I'm thankfull for any advice!
'Sepal.Length' == 7.0 is FALSE. Always. The string 'Sepal.Length' is not equal to the number 7. There's nothing in your code that tells base R that you intend Sepal.Length as a column name. (And similarly for 'Species' == 'versicolor'.)
You can get it like this:
list_all[['iris_a']][
list_all[['iris_a']][['Sepal.Length']] == 7.0 &
list_all[['iris_a']][['Species']] == 'versicolor',
'Petal.Width']
I think an lapply might be much nicer if you're doing the same thing to every data frame in the list:
list_all <- lapply(list_all, \(df) {
df[ ## cell to replace
df[['Sepal.Length']] == 7.0 &
df[['Species']] == 'versicolor',
'Petal.Width'
] <-
## first value to sum
df[
df[['Sepal.Length']] == 7.0 &
df[['Species']] == 'versicolor',
'Petal.Width'
] + df[ ## second value to sum
df[['Sepal.Length']] == 6.8 &
df[['Species']] == 'versicolor',
'Petal.Width'
]
## return the modified df
df
})
Thanks Gregor! I used this code following your advice, tried the %<>% that sindri recommended and it works now quite well.
list_all <- list_all %>%
map(\(df) {
df[df[["Sepal.Length"]] == 7.0 & df[["Species"]] == "versicolor", "Petal.Width"] %<>%
add(df[df[["Sepal.Length"]] == 6.8 & df[["Species"]] == "versicolor", "Petal.Width"])
df
})

Error in 'group by' stating its a character for looping through summarise R

still getting to grips with R, and as a newbie, as I have been doing most of my coding manual, ie copy and paste the same block of code 20 times. Here, I was trying to learn about looping and summarising. When I do the summarise with one individual dataset, it works fine, but now I try and loop it, it states its a character, so I added in 'as.numeric' to no prevail. Any advice would be helpful.
Error:
Error in UseMethod("group_by_") :
no applicable method for 'group_by_' applied to an object of class "character"
#educationYears
fiv0_education <- subset(sf_education, Year == '2005')
six0_education <- subset(sf_education, Year == '2006')
sev0_education <- subset(sf_education, Year == '2007')
eig0_education <- subset(sf_education, Year == '2008')
nin0_education <- subset(sf_education, Year == '2009')
ten_education <- subset(sf_education, Year == '2010')
one_education <- subset(sf_education, Year == '2011')
two_education <- subset(sf_education, Year == '2012')
thr_education <- subset(sf_education, Year == '2013')
for_education <- subset(sf_education, Year == '2014')
fiv_education <- subset(sf_education, Year == '2015')
six_education <- subset(sf_education, Year == '2016')
sev_education <- subset(sf_education, Year == '2017')
eig_education <- subset(sf_education, Year == '2018')
nin_education <- subset(sf_education, Year == '2019')
names <- c('fiv0', 'six0', 'sev0', 'eig0', 'nin0', 'ten', 'one', 'two', 'thr', 'for', 'fiv', 'six', 'sev', 'eig', 'nin')
test <- vector("list", length(names))
for (i in 1:length(names)) {
test[i] <- paste(names[i], '_education', sep = "", collapse = NULL) %>%
group_by(as.numeric(as.character(Kod))) %>%
summarise(Count=sum(as.numeric(as.character(Count))))
}
Here is a solution using built-in data set iris as an example. I believe it's easy to adapt to the problem in the question.
1. A solution with a for loop, like in the question.
library(dplyr)
names <- c('fiv0', 'six0', 'sev0')
test <- vector("list", length(names))
for (i in 1:length(names)) {
tmp <- paste0(names[i], '_education')
test[[i]] <- get(tmp, envir = .GlobalEnv) %>%
mutate(Count = as.numeric(as.character(Count))) %>%
group_by(Kod) %>%
summarise(Count = sum(Count))
}
test
#[[1]]
## A tibble: 4 x 2
# Kod Count
# <int> <dbl>
#1 1 1.6
#2 2 3.7
#3 3 2.4
#4 4 4.6
#
#[[2]]
## A tibble: 4 x 2
# Kod Count
# <int> <dbl>
#1 1 24.5
#2 2 27.2
#3 3 19.1
#4 4 30.5
#
#[[3]]
## A tibble: 4 x 2
# Kod Count
# <int> <dbl>
#1 1 15.9
#2 2 18.9
#3 3 15.5
#4 4 16
2. Here is another way, with purrr::map.
This code uses the data set already split in several with subset.
paste0(names, '_education') %>%
mget(envir = .GlobalEnv) %>%
purrr::map(
function(X){
X %>%
mutate(Count = as.numeric(as.character(Count))) %>%
group_by(Kod) %>%
summarise(Count = sum(Count))
}
)
3. Another purrr:map way, but this time from the original data set, with no need to subset multiple times first.
Note that the splitting column here is Species, in the question it's Year.
df1 %>%
group_split(Species) %>%
purrr::map(
function(X){
X %>%
mutate(Count = as.numeric(as.character(Count))) %>%
group_by(Kod) %>%
summarise(Count = sum(Count))
}
)
Data creation code.
set.seed(1234)
df1 <- iris[4:5]
names(df1)[1] <- "Count"
df1$Kod <- sample(4, 150, TRUE)
fiv0_education <- subset(df1, Species == 'setosa')
six0_education <- subset(df1, Species == 'virginica')
sev0_education <- subset(df1, Species == 'versicolor')

Equivalent of next in purrr::map_df

I am looking for the equivalent of next in loops for a purrr::map_df call.
map_df plays nicely with dataframes that are NULL (as in the example below), so it works when I set Result <- NULL in my example below.
Could anyone suggest a general solution to my illustration below that would not require me setting Result <- NULL, but rather immediately go "next".
library(tidyverse)
set.seed(1000)
df <- data.frame(x = rnorm(100), y = rnorm(100), z = rep(LETTERS, 100))
Map_Func <- function(df) {
Sum_Num <- suppressWarnings(sqrt(sum(df$y)))
if( Sum_Num == "NaN" ) {
Result <- NULL
# I would like to have an equivalent to "next" here...
} else {
Result <- df %>% filter(y == max(y)) %>% mutate(Result = x*y)
}
Result
}
Test <- split(df, df$z) %>% map_df(~Map_Func(.))
In the code above, what can I use instead of Result <- NULL in the ugly if statement (i.e. I want to simply check a condition and effectively do a "next").
To exit a function you can use the return(<output>) command. This immediately exits the function with the output you define. The following gives the same output you were getting with your sample code.
library(tidyverse)
set.seed(1000)
df <- data.frame(x = rnorm(100), y = rnorm(100), z = rep(LETTERS, 100))
Map_Func <- function(df) {
Sum_Num <- suppressWarnings(sqrt(sum(df$y)))
if( Sum_Num == "NaN" ) {
return(NULL)
}
Result <- df %>% filter(y == max(y)) %>% mutate(Result = x*y)
}
Test <- split(df, df$z) %>% map_df(~Map_Func(.))
Logic wise not a very different solution than OP but trying to keep it clean by using separate functions. custom_check function is to check the condition for each group. Using map_if we apply the function Map_Func_true only when custom_check returns TRUE or else apply Map_Func_false which returns NULL and finally bind the rows.
library(tidyverse)
Map_Func_true <- function(df) {
df %>% filter(y == max(y)) %>% mutate(Result = x*y)
}
Map_Func_false <- function(df) { return(NULL) }
custom_check <- function(df) {
!is.nan(suppressWarnings(sqrt(sum(df$y))))
}
df %>%
group_split(z) %>%
map_if(., custom_check, Map_Func_true, .else = Map_Func_false) %>%
bind_rows()
# A tibble: 26 x 4
# x y z Result
# <dbl> <dbl> <fct> <dbl>
# 1 1.24 2.00 A 2.47
# 2 1.24 2.00 A 2.47
# 3 1.24 2.00 C 2.47
# 4 1.24 2.00 C 2.47
# 5 1.24 2.00 E 2.47
# 6 1.24 2.00 E 2.47
# 7 1.24 2.00 G 2.47
# 8 1.24 2.00 G 2.47
# 9 1.24 2.00 I 2.47
#10 1.24 2.00 I 2.47
# … with 16 more rows
Here's another way of looking at it using purrr::safely
Map_Func <- function(df) {
Sum_Num <- suppressWarnings(sqrt(sum(df$y)))
df %>% filter(y == max(y)) %>% mutate(Result = x*y)
}
Test <- split(df, df$z) %>%
map(safely(~Map_Func(.))) %>%
transpose() %>%
pluck("result") %>% # use 'error' here to get the error log
bind_rows()
This way the function becomes cleaner and you also get a nice log of errors

Applying function to each group and column of R dataframe

I need to apply this function
replace_outliers <- function(column) {
qnt <- quantile(column, probs=c(.25, .75))
upper_whisker <- 1.5 * IQR(column)
clean_data <- column
clean_data[column > (qnt[2] + upper_whisker)] <- median(column)
clean_data
}
to dataset that look like this:
Category a b c
a 2.0 5.0 -5.0
a 1.5 10.0 10.0
b 3.2 14.5 100.2
... ... ... ...
I have to apply replace_outliers to each category apart and for each column. How to achieve that?
You can use the package dplyr. Use group_by to do it for each Category and mutate_if to apply the function to all numerical columns
library(dplyr)
df <- read.table(header = TRUE, text =
" Category a b c
a 2.0 5.0 -5.0
a 1.5 10.0 10.0
b 3.2 14.5 100.2")
replace_outliers <- function(column) {
qnt <- quantile(column, probs=c(.25, .75))
upper_whisker <- 1.5 * IQR(column)
clean_data <- column
clean_data[column > (qnt[2] + upper_whisker)] <- median(column)
clean_data
}
df %>% group_by(Category) %>%
mutate_if(is.numeric, replace_outliers)
Use mutate_all within a group_by:
library(dplyr)
DF %>%
group_by(Category) %>%
mutate_all(replace_outliers) %>%
ungroup
Consider base R with by (to split by category), sapply (to call function), and do.call to bind all groups back together:
df_list <- by(data, data$category, function(sub) {
sub[-1] <- sapply(sub[-1], replace_outliers)
sub
})
final_df <- do.call(rbind, unname(df_list))

Return two objects from lapply

I have created a function which takes a little while to run (lots of crunching going on) and there are two distinct outputs that I need to return from this function. The inputs into these outputs are the same which is why I have combined them in the same function so that I don't have to crunch them twice, but the outputs are so entirely different in content and based on such entirely different calculations that there is no way to actually combine them into a one parse kinda statement. One object is tens of lines earlier than the other. But I need to return both, so I think it has to be in some type of format which mimics: store the two separate objects in a single list, lapply, then extract and rbind the two objects.
Any help on a solution to this would be appreciated - ideally not using a for loop or data.table. Dplyr solutions are fine.
Some dummy data:
df <- data.frame(ID = c(rep("A",10), rep("B", 10), rep("C", 10)),
subID = c(rep("U", 5),rep("V", 5),rep("W", 5),rep("X", 5),rep("Y", 5),rep("Z", 5)),
Val = c(1,6,3,8,6,5,2,4,7,20,4,2,3,5,7,3,2,5,7,12,5,3,7,1,6,1,34,9,5,3))
The function (again noting the function is much more complex than this, and I am calculating many more complex and unrelated things in each of the separate objects, not just the average!):
func <- function(x, df){
temp <- filter(df, ID == x)
average_id <- temp %>% group_by(ID) %>% summarise(avg = mean(Val))
average_subid <- temp %>% group_by(ID, subID) %>% summarise(avg = mean(Val))
df_list <- list(avgID=average_id, avgSubID=average_subid)
return(df_list)
}
Presently I have computed the results using this command, but am unsure whether this is correct or how to further extract the results after the objects are stored in this list (of lists) (i.e. I get stuck here):
result <- lapply(list("A","B","C"), func, df)
The result should look like:
> average_ID
ID avg
1 A 6.2
2 B 5.0
3 C 7.4
> average_subID
ID subID avg
1 A U 4.8
2 A V 7.6
3 B W 4.2
4 B X 5.8
5 C Y 4.4
6 C Z 10.4
I have previously used a for loop and stored the results in lists (i.e. avgListID[x] <- average_id, then binded together. But I don't think this is ideal.
Thanks in advance!
I realize this is a bit old, but since neither provided answer seems to have done the trick, how about this? Split the function into two, and run each within your lapply, returning a list of lists?
library(dplyr)
df <- data.frame(ID = c(rep("A",10), rep("B", 10), rep("C", 10)),
subID = c(rep("U", 5),rep("V", 5),rep("W", 5),rep("X", 5),rep("Y", 5),rep("Z", 5)),
Val = c(1,6,3,8,6,5,2,4,7,20,4,2,3,5,7,3,2,5,7,12,5,3,7,1,6,1,34,9,5,3))
subfunc1 <- function(temp){
return(temp %>% group_by(ID) %>% summarise(avg = mean(Val)))
}
subfunc2 <- function(temp){
return(temp %>% group_by(ID, subID) %>% summarise(avg = mean(Val)))
}
func <- function(x, df){
temp <- filter(df, ID == x)
df_list <- list(avgID=subfunc1(temp), avgSubID=subfunc2(temp))
return(df_list)
}
result <- lapply(list("A","B","C"), func, df)
To get the structure/order you need, transpose the lists as explained here:
n <- length(result[[1]]) # assuming all lists in result have the same length
result <- lapply(1:n, function(i) lapply(result, "[[", i))
> average_ID <- aggregate(df$Val, by = list(df$ID), FUN = mean)
>
> average_ID
Group.1 x
1 A 6.2
2 B 5.0
3 C 7.4
> average_subID <- aggregate(df$Val, by = list(df$ID,df$subID), FUN = mean)
>
> average_subID
Group.1 Group.2 x
1 A U 4.8
2 A V 7.6
3 B W 4.2
4 B X 5.8
5 C Y 4.4
6 C Z 10.4
What about returning a list where each element represents the averages at a specific grouping level. For example:
library(tidyverse)
fnc = function(groups=NULL, data=df) {
groups=as.list(groups)
data %>%
group_by_(.dots=groups) %>%
summarise(avg=mean(Val))
}
list(Avg_Overall=NULL, Avg_by_ID="ID", Avg_by_SubID=c("ID","subID")) %>%
map(~fnc(.x))
$Avg_Overall
# A tibble: 1 x 1
avg
<dbl>
1 6.2
$Avg_by_ID
# A tibble: 3 x 2
ID avg
<fctr> <dbl>
1 A 6.2
2 B 5.0
3 C 7.4
$Avg_by_SubID
# A tibble: 6 x 3
# Groups: ID [?]
ID subID avg
<fctr> <fctr> <dbl>
1 A U 4.8
2 A V 7.6
3 B W 4.2
4 B X 5.8
5 C Y 4.4
6 C Z 10.4
You could also just calculate the average by subID and then the average by ID can be calculated from that:
# Average by subID
avg = df %>% group_by(ID, subID) %>%
summarise(n = n(),
avg = mean(Val))
# Average by ID
avg %>%
group_by(ID) %>%
summarise(avg = sum(avg*n)/sum(n))
# Overall average
avg %>%
ungroup %>%
summarise(avg = sum(avg*n)/sum(n))

Resources