my dataframe:
Name        Value
Setosa      1
Versicolor  2
So first of all, I want to check if an input matches the name in any row.
My solution for the filter so far:
# input$df <- "Versicolor"
library(dplyr)
df_table <- df_table %>%
  dplyr::filter(grepl(input$df, Name, ignore.case = TRUE))
If there is a match, I'd like to update/overwrite this row with new values, like in the following table:
Name        Value
Setosa      1
Versicolor  3
The name stays the same, but only the value changes.
Do you have any advice?
You can try the following:
df_table[df_table$Name == input$df, 'Value'] <- new_value
This will update the Value column for all rows where the value in Name is the same as input$df, which in your example is Versicolor.
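For a quick check outside of Shiny, here is a minimal sketch of that update; input_value and new_value are stand-ins for the reactive input$df and the replacement value, not names from the original question.
# Minimal reproducible sketch of the base R update
df_table <- data.frame(Name = c("Setosa", "Versicolor"), Value = c(1, 2))
input_value <- "Versicolor"   # stands in for input$df
new_value <- 3

df_table[df_table$Name == input_value, "Value"] <- new_value
df_table
#>         Name Value
#> 1     Setosa     1
#> 2 Versicolor     3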
We can use a join
library(dplyr)
df_table %>%
left_join(input, by = c("Name")) %>%
# prefer the updated value (Value.y) where there is a match, otherwise keep the original
mutate(Value = coalesce(Value.y, Value.x), .keep = "unused")
Or with data.table
library(data.table)
setDT(df_table)[input, Value := i.Value, on = .(Name)]
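Both join approaches above assume that input is a small lookup data frame (one row per Name to update, with the replacement Value) rather than the raw Shiny input string; something along these lines, which is an assumption and not shown in the original answers:
# Assumed shape of the lookup table used by the joins above
input <- data.frame(Name = "Versicolor", Value = 3)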
We can use {powerjoin}:
library(powerjoin)
df1 <- data.frame(Name = c("Setosa", "Versicolor"), Value = c(1, 2))
df2 <- data.frame(Name = "Versicolor", Value = 3)
power_left_join(df1, df2, by = "Name", conflict = coalesce_yx)
#> Name Value
#> 1 Setosa 1
#> 2 Versicolor 3
# or doing a row-wise sum:
power_left_join(df1, df2, by = "Name", conflict = rw ~ sum(.x, .y, na.rm = TRUE))
#> Name Value
#> 1 Setosa 1
#> 2 Versicolor 5
Created on 2022-04-14 by the reprex package (v2.0.1)
Say I have a data frame:
df <- data.frame(a = 1:10,
                 b = 1:10,
                 c = 1:10)
I'd like to apply several summary functions to each column, so I use dplyr::summarise_all
library(dplyr)
df %>% summarise_all(.funs = c(mean, sum))
# a_fn1 b_fn1 c_fn1 a_fn2 b_fn2 c_fn2
# 1 5.5 5.5 5.5 55 55 55
This works great! Now, say I have a function that takes an extra parameter. For example, this function calculates the number of elements in a column above a threshold. (Note: this is a toy example and not the real function.)
n_above_threshold <- function(x, threshold) sum(x > threshold)
So, the function works like this:
n_above_threshold(1:10, 5)
#[1] 5
I can apply it to all columns like before, but this time passing the additional parameter, like so:
df %>% summarise_all(.funs = c(mean, n_above_threshold), threshold = 5)
# a_fn1 b_fn1 c_fn1 a_fn2 b_fn2 c_fn2
# 1 5.5 5.5 5.5 5 5 5
But, say I have a vector of thresholds where each element corresponds to a column. Say, c(1, 5, 7) for my example above. Of course, I can't simply do this, as it doesn't make any sense:
df %>% summarise_all(.funs = c(mean, n_above_threshold), threshold = c(1, 5, 7))
If I was using base R, I might do this:
mapply(n_above_threshold, df, c(1, 5, 7))
# a b c
# 9 5 3
Is there a way of getting this result as part of a dplyr piped workflow like I was using for the simpler cases?
dplyr provides a bunch of context-dependent functions. One is cur_column(). You can use it in summarise to look up the threshold for a given column.
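As a quick aside (not part of the answer that follows), a minimal illustration of what cur_column() returns inside across():
library(dplyr)

# Each column is summarised to its own name: the result is a one-row tibble
# with a = "a" and b = "b", i.e. cur_column() evaluates to the name of the
# column that across() is currently working on.
tibble(a = 1, b = 2) %>%
  summarise(across(everything(), ~ cur_column()))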
library("tidyverse")
df <- data.frame(
  a = 1:10,
  b = 1:10,
  c = 1:10
)
n_above_threshold <- function(x, threshold) sum(x > threshold)
# Pair the parameters with the columns
thresholds <- c(1, 5, 7)
names(thresholds) <- colnames(df)
df %>%
  summarise(
    across(
      everything(),
      # Use `cur_column()` to access each column name in turn
      list(count = ~ n_above_threshold(.x, thresholds[cur_column()]),
           mean = mean)
    )
  )
#> a_count a_mean b_count b_mean c_count c_mean
#> 1 9 5.5 5 5.5 3 5.5
This returns NA silently if the current column name doesn't have a known threshold. This is something that you might or might not want to happen.
df %>%
  # Add extra column to show what happens if we don't know the threshold for a column
  mutate(
    x = 1:10
  ) %>%
  summarise(
    across(
      everything(),
      # Use `cur_column()` to access each column name in turn
      list(count = ~ n_above_threshold(.x, thresholds[cur_column()]),
           mean = mean)
    )
  )
#> a_count a_mean b_count b_mean c_count c_mean x_count x_mean
#> 1 9 5.5 5 5.5 3 5.5 NA 5.5
Created on 2022-03-11 by the reprex package (v2.0.1)
I have this type of data:
df <- data.frame(
  Utt = c(rep("oh", 10), rep("ah", 10)),
  name = rep(LETTERS[1:2], 10),
  value = c(0.5, 2, 2, 2, 2, 1, 0, 1, 3.5, 1,
            2.2, 2.3, 1.9, 0.1, 0.3, 1.8, 3, 4, 3.5, 2)
)
I need to know whether, within each group of Utt and name, there are continuous value increases and how large these increases are.
EDIT: I've cobbled together this code, which produces the right result but seems convoluted:
library(dplyr)
library(stringr)     # for str_c()
library(data.table)  # for rleid()

df %>%
  # order by name:
  arrange(name) %>%
  group_by(name, Utt) %>%
  # mutate:
  mutate(
    # is there an increase from one value to the next?
    is_increase = ifelse(lag(value) < value, value, NA),
    # what's the difference between these values?
    diff = is_increase - lag(value)) %>%
  group_by(name, Utt, grp = rleid(!is.na(diff))) %>%
  # sum the contiguous values:
  summarise(increase_size = sum(diff, na.rm = TRUE)) %>%
  # remove 0 values:
  filter(!increase_size == 0) %>%
  # put same-group increase_sizes in the same row:
  summarise(
    increase_size = str_c(increase_size, collapse = ', '))
# A tibble: 3 x 3
# Groups: name [2]
name Utt increase_size
<chr> <chr> <chr>
1 A ah 3.2
2 A oh 1.5, 3.5
3 B ah 3.9
NOTE: Ideally, the expected outcome would be:
1 A ah 3.2
2 A oh 1.5, 3.5
3 B ah 3.9
4 B oh NA
Is there a better (i.e., more concise, more clever) dplyr solution?
Use this function to find what you want.
library(dplyr)  # for lag() and lead()

f <- function(x) {
  # positions where the value is higher than the previous one
  ind <- which(x > lag(x))
  if (length(ind) == 0) {
    return(NA)
  }
  # last position of each run of consecutive increases
  ind2 <- ind[which(lead(ind, default = max(ind) + 2) - ind > 1)]
  # position just before each run of consecutive increases starts
  ind1 <- ind[which(ind - lag(ind, default = min(ind) - 2) > 1)] - 1
  # total size of each contiguous increase, collapsed into one string
  return(paste0(x[ind2] - x[ind1], collapse = ", "))
}
And use the function in summarise:
df %>% group_by(name, Utt) %>% summarise(increase = f(value))
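With the example data this should produce the desired outcome from the question, including the NA row for B/oh (a sketch of the expected result, not a verbatim console capture):
#>   name  Utt   increase
#> 1 A     ah    3.2
#> 2 A     oh    1.5, 3.5
#> 3 B     ah    3.9
#> 4 B     oh    NA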
Using the tidyverse, my solution was similar to yours. One possible modification might be to subset your columns before summing instead of filtering. This will keep all combinations of name and Utt and allow for NA in increase_size at the end. Since the increase_size column is of character type, you can convert the empty string to NA.
library(data.table)
library(tidyverse)
df %>%
  arrange(name) %>%
  group_by(name, Utt) %>%
  mutate(diff = c(0, diff(value))) %>%
  group_by(grp = rleid(diff < 0), .add = TRUE) %>%
  summarise(increase_size = sum(diff[diff > 0], na.rm = TRUE)) %>%
  group_by(name, Utt) %>%
  summarise(increase_size = toString(increase_size[increase_size > 0])) %>%
  mutate(increase_size = na_if(increase_size, ""))
Output
name Utt increase_size
<chr> <chr> <chr>
1 A ah 3.2
2 A oh 1.5, 3.5
3 B ah 3.9
4 B oh NA
I need to perform calculations based on inputs defined in a data frame. Refer to the data frame RefDf below. It has 3 columns: Variables (the column name), Calculation, and NewVariable (the new variable name). When the Calculation column contains count, we should use the n_distinct() function.
RefDf <- read.table(text = "Variables Calculation NewVariable
Sepal.Length sum Sepal.Length2
Petal.Length count Petal.LengthNew
", header = T)
Manual approach, which needs to be automated via the inputs in RefDf (Species remains the same for grouping):
library(dplyr)
iris %>% group_by_at("Species") %>%
summarise(Sepal.Length2 = sum(Sepal.Length,na.rm = T),
Petal.LengthNew = n_distinct(Petal.Length, na.rm = T)
)
I am looking for a dplyr or base R based solution.
Here's a solution with the data.table package:
library(data.table)
library(dplyr)
# using data.table
dt <- as.data.table(RefDf)
dt[Calculation == "count", Calculation := "n_distinct"]
# function for doing grouping calculation
inner.fun <- function(calc, data, column, group = "Species") {
  print(column)
  data.dt <- as.data.table(data)
  data.dt[, .(as.numeric(get(calc)(get(column)))), by = group][]
}
out <- dt[, inner.fun(calc=Calculation, data=iris, column=Variables), by=NewVariable]
# reshape from wide to long
out2 <- dcast(data=out, Species ~ NewVariable, value.var="V1")
# convert to data.frame
out_df <- as.data.frame(out2)
out_df
Species Petal.LengthNew Sepal.Length2
1 setosa 9 250.3
2 versicolor 19 296.8
3 virginica 20 329.4
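Since the question asked for a dplyr or base R based solution, here is one possible dplyr-only sketch. It builds one summarising expression per row of RefDf and splices them into summarise(); the mapping of the Calculation keywords to functions (count -> n_distinct(), anything else looked up by name) is an assumption based on the question, not part of the answer above.
library(dplyr)
library(purrr)

# One named quosure per row of RefDf; the list names become the new column names
summ_exprs <- set_names(
  map2(as.character(RefDf$Variables), as.character(RefDf$Calculation),
       function(v, calc) {
         f <- if (calc == "count") n_distinct else match.fun(calc)
         rlang::quo(f(.data[[v]], na.rm = TRUE))
       }),
  RefDf$NewVariable
)

iris %>%
  group_by(Species) %>%
  summarise(!!!summ_exprs)
This should reproduce the manually written summarise() from the question.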
I want to use a custom function and return columns with "_cat_mean" appended to each column name.
In the code below, "$cat_mean" is added instead, and I can't select the column by that name.
summarise_categories <- function(x) {
  tibble(
    cat_mean = round(mean(x) * 2) / 2
  )
}

iris_summarised <- iris %>%
  group_by(Species) %>%
  summarise(across(ends_with("Length"), ~summarise_categories(.)))
Selecting columns by the name that is displayed doesn't work:
iris_summarised %>%
select(Species, Sepal.Length$cat_mean)
But this works
iris_summarised %>%
select(Species, Sepal.Length)
I want the column to be named "Sepal.Length_cat_mean"
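For reference, the reason the select() above fails is that the across() call has created packed data-frame columns: Sepal.Length is itself a one-column tibble holding cat_mean. One way to flatten such columns (a sketch, not taken from the answers below) is tidyr::unpack() with a name separator, which yields names like Sepal.Length_cat_mean:
library(tidyr)

# Flatten the packed columns; names_sep pastes the outer and inner names together
iris_summarised %>%
  unpack(cols = ends_with("Length"), names_sep = "_")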
You can use the .names argument in across to give new column names.
library(dplyr)
summarise_categories <- function(x) {
  round(mean(x) * 2) / 2
}

iris %>%
  group_by(Species) %>%
  summarise(across(ends_with("Length"), summarise_categories,
                   .names = '{col}_cat_mean')) -> iris_summarised
iris_summarised
# Species Sepal.Length_cat_mean Petal.Length_cat_mean
# <fct> <dbl> <dbl>
#1 setosa 5 1.5
#2 versicolor 6 4.5
#3 virginica 6.5 5.5
Using base R with colMeans and by
by(iris[-5], iris$Species, function(x) round(colMeans(x) * 2) /2)
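by() returns a list-like "by" object rather than a data frame; if a rectangular result is preferred, one possible follow-up (an addition, not part of the original answer) is to row-bind the per-Species vectors:
# Collect the per-Species vectors returned by by() into a matrix
res <- by(iris[-5], iris$Species, function(x) round(colMeans(x) * 2) / 2)
do.call(rbind, res)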
I need to run very similar code for 3 different datasets. My current code looks like this:
## data a
a_dat2 <- merge(a_dat, zip, by = "zip", all.x = T)
a_dat2 <- a_dat2 %>%
group_by(zip) %>%
summarize(dist_a_min = min(dist))
## data b
b_dat2 <- merge(b_dat, zip, by = "zip", all.x = T)
b_dat2 <- b_dat2 %>%
group_by(zip) %>%
summarize(dist_b_min = min(dist))
## data c
c_dat2 <- merge(c_dat, zip, by = "zip", all.x = T)
c_dat2 <- c_dat2 %>%
group_by(zip) %>%
summarize(dist_c_min = min(dist))
The code for the 3 datasets is the same except that the name of the data varies: a_dat, b_dat, c_dat. The output variable name varies too: dist_a_min, dist_b_min, dist_c_min. What function/loop can be used to shorten the code so that I don't need to copy and paste for each dataset separately?
An option would be to place the elements in a list with mget, loop through the list with imap, join (?left_join) with the 'zip' dataset, group by 'zip', and get the min of 'dist' while creating the column name from a substring of the identifier name.
library(tidyverse)
mget(ls(pattern = "_dat$")) %>%
  imap(~ left_join(.x, zip, by = 'zip') %>%
         group_by(zip) %>%
         summarise(!! str_c('dist_', substr(.y, 1, 1), '_min') := min(dist)))
Or another option is to create a function for the repeated tasks:
joinSumm <- function(dat, groupName, colName, data2) {
  groupName <- enquo(groupName)
  colName <- enquo(colName)
  nm1 <- str_c('dist_', str_sub(rlang::as_name(enquo(dat)), 1, 1), '_min')
  dat %>%
    left_join(data2, by = rlang::as_name(groupName)) %>%
    group_by(!! groupName) %>%
    summarise((!! nm1) := min(!! colName))
}
joinSumm(a_dat, zip, dist, zip)
joinSumm(b_dat, zip, dist, zip)
A reproducible example with the built-in dataset iris (without the join part):
list(a_dat = iris, b_dat = iris, c_dat = iris) %>%
  imap(~ .x %>%
         group_by(Species) %>%
         summarise(!! str_c('dist_', substr(.y, 1, 1), '_min') := min(Sepal.Length)))
#$a_dat
# A tibble: 3 x 2
# Species dist_a_min
# <fct> <dbl>
#1 setosa 4.3
#2 versicolor 4.9
#3 virginica 4.9
#$b_dat
# A tibble: 3 x 2
# Species dist_b_min
# <fct> <dbl>
#1 setosa 4.3
#2 versicolor 4.9
#3 virginica 4.9
#$c_dat
# A tibble: 3 x 2
# Species dist_c_min
# <fct> <dbl>
#1 setosa 4.3
#2 versicolor 4.9
#3 virginica 4.9