Mutate ifelse on a vector - r

Let's say I have this data frame:
set.seed(2)
df <- iris[c(1:5,51:55,101:105),]
df_long <- gather(df, key = "flower_att", value = "measurement",
Sepal.Length, Sepal.Width, Petal.Length, Petal.Width)
df_long$setosa_sub <-sample(5,size = 60, replace = TRUE)
df_long$versicolor_sub <-sample(5,size = 60, replace = TRUE)
df_long$virginica_sub <-sample(5,size = 60, replace = TRUE)
df_long$sub_q<-0
Now I want to copy a value to sub_q variable based on Species variable and sub values.
I know how to do it one by one:
df_long2 <- df_long %>%
mutate(sub_q =ifelse(Species =="setosa", setosa_sub,sub_q)) %>%
mutate(sub_q =ifelse(Species =="versicolor", versicolor_sub,sub_q)) %>%
mutate(sub_q =ifelse(Species =="virginica", virginica_sub,sub_q))
But I can't figure out what is the right way to apply on a vector of the Species values instead.
species_vector <- c("setosa","versicolor","virginica")
I'm actually not sure if I need to make new function or just loop it somehow. Hope it's make sense...

I don't see anything wrong with the way you are doing it. Another way, using an apply function (sapply in this case) would work like this:
# a helper function to find the right value for the xth row
get_correct_sub <- function(x){
col_name = paste0(df_long$Species[x],'_sub')
df_long[[ col_name ]][x] }
# apply each row index to the helper function
df_long2 = df_long
df_long2$sub_q = sapply(1:nrow(df_long), get_correct_sub)
The helper function adds "_sub" to the species name, treats that as a column name, and then gets the value for that column.

Here is a datastep() solution. I created a vector lookup to map the Species to the desired column, then step through the data row by row and assign the value using the lookup. data is the input dataset and n. is the current row number:
library(libr)
# Create vector lookup
species_vector <- c("setosa" = "setosa_sub", "versicolor" = "versicolor_sub", "virginica" = "virginica_sub")
# Step through data row by row, and assign value using lookup
df_long2 <- df_long %>%
datastep({
sub_q <- data[n., species_vector[Species]]
})

Related

Rename column within function in r using dplyr

In this case, I have a loop that triggers a function, which in turn triggers a function that collects the data.
One weird thing, is I cannot rename the columns in the dataset created - d. Bascially I need standardised names such that I can pass different variables, and as a result, I need to rename the columns during the dplyr transformation. The problem is here: %>% rename(Con = 1, DV = 2). In the dataset I have selected, I want to label the first column con, and the second column DV, such that I can pass this into the CollectDEffect function to run the cohensD analysis. All of this works when I run line by line, but I want to run the function by all the DVs and create a table, hence why I need to get this working within the loop.
# Function to run analyses and create the dataframe with output
CD_EE_DF <- data.frame("Test" = character())
CollectDEffect = function(cd, d){
excess <- data.frame("Test" = cd,
"Sample Size" = nrow(d),
"Original Cohen's d" = cohensD(d$DV ~ d$Con))
CD_EE_DF <- rbind(CD_EE_DF, excess)
return(CD_EE_DF)
}
# Data transformation, where the error is
CollectDEffect_Trigger = function(DVTest){
# Problem occurs here with the rename
d <- df %>% filter(Gender == "Female", Target_Gender != "") %>% select(Target_Gender, DVTest) %>% rename(Con = 1, DV = 2) %>% na.omit()
CD_EE_DF <- CollectDEffect(paste0("A_",DVTest),d)
}
# Loop that triggers all of the analyses
vec_dv <- c("Status", "warmth")
for (DVTest in vec_dv) {
CD_EE_DF <- CollectDEffect_Trigger(DVTest)
}

r.squared matrix of predictions vs actual values in R

I want to create a matrix that displays the r.squared coefficient of determination of some predictions made over the years and the actual values.
My goal is to display a matrix that looks something like this.
The only way I found is to make multiple lists, calculate each row/ column individually using map2_dbl(l.predicted_line1, l.actual, ~ summary(lm(.x ~ .y))$r.squared), and then add the resulting vectors in a matrix with some code. This would create 9 lists, which I want to avoid.
Is there any way of doing this in a more efficiently?
#sample data
l.actual <- list(
overall_15 = c(59,65,73,73,64,69,64,69,63,NA,82,60,NA,73,NA,73,73,NA,69,
69,71,66,65,70,72,72,NA,64,69,67,64,71,NA,62,62,71,67,63,64,76,72),
overall_16 = c(60,68,75,74,68,71,NA,72,64,69,82,66,64,77,NA,71,72,NA,69,
69,75,67,71,73,73,73,NA,66,NA,69,65,70,76,NA,67,71,72,64,65,76,73),
overall_17 = c(63,68,NA,74,72,72,NA,73,66,69,83,67,64,76,NA,71,73,NA,70,
70,79,NA,73,72,NA,NA,NA,NA,NA,70,NA,70,77,NA,68,74,74,66,64,75,69),
overall_18 = c(NA,68,NA,78,73,72,NA,72,68,67,86,NA,62,75,65,71,71,67,71,
71,76,NA,71,71,NA,NA,74,NA,71,NA,NA,68,74,NA,67,75,74,65,NA,72,NA),
overall_19 = c(NA,NA,NA,77,73,72,NA,71,69,66,87,63,62,73,65,NA,NA,NA,NA,
NA,75,NA,NA,67,NA,NA,73,NA,NA,NA,NA,NA,74,NA,NA,74,74,65,NA,68,NA),
overall_20 = c(NA,NA,NA,77,NA,NA,NA,72,71,66,87,NA,NA,NA,65,NA,NA,NA,70,
70,75,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,74,NA,66,71,73,NA,NA,69,NA),
overall_21 = c(NA,67,NA,76,NA,69,NA,73,69,65,85,NA,NA,NA,NA,NA,NA,NA,NA,
NA,75,NA,NA,NA,NA,NA,69,NA,NA,NA,NA,NA,73,NA,67,68,72,NA,NA,68,NA),
overall_22 = c(NA,NA,NA,75,NA,NA,NA,75,67,65,84,NA,NA,NA,NA,NA,NA,NA,68,
68,73,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,67,69,71,NA,NA,68,NA)
)
l.predicted <- list(
potential_15 = c(59,68,74,76,65,75,64,72,66,NA,85,60,NA,76,NA,73,75,NA,71,
71,71,67,65,70,72,72,NA,68,74,67,64,71,NA,62,62,71,71,63,67,78,72),
potential_16 = c(60,71,75,75,68,73,NA,74,66,69,83,66,64,77,NA,71,74,NA,70,
70,76,67,71,73,73,73,NA,66,NA,69,65,70,76,NA,67,71,72,64,66,76,73),
potential_17 = c(63,69,NA,75,72,72,NA,73,69,69,83,67,64,76,NA,71,73,NA,70,
70,79,NA,73,72,NA,NA,NA,NA,NA,70,NA,70,77,NA,68,74,74,66,64,75,69),
potential_18 = c(NA,68,NA,78,73,72,NA,72,69,67,86,NA,62,75,65,71,71,67,71,
71,76,NA,71,71,NA,NA,74,NA,71,NA,NA,68,74,NA,67,75,74,65,NA,72,NA),
potential_19 = c(NA,NA,NA,77,73,72,NA,71,70,66,87,63,62,73,65,NA,NA,NA,NA,
NA,75,NA,NA,67,NA,NA,73,NA,NA,NA,NA,NA,74,NA,NA,74,74,65,NA,68,NA),
potential_20 = c(NA,NA,NA,77,NA,NA,NA,72,71,66,87,NA,NA,NA,65,NA,NA,NA,70,
70,75,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,74,NA,66,71,73,NA,NA,69,NA),
potential_21 = c(NA,67,NA,76,NA,69,NA,73,69,65,85,NA,NA,NA,NA,NA,NA,NA,NA,
NA,75,NA,NA,NA,NA,NA,69,NA,NA,NA,NA,NA,73,NA,67,68,72,NA,NA,68,NA),
potential_22 = c(NA,NA,NA,75,NA,NA,NA,75,67,65,84,NA,NA,NA,NA,NA,NA,NA,68,
68,73,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,67,69,71,NA,NA,68,NA)
)
Here is a solution using some tidyverse packages. The key thing is to use the function expand_grid() to get all combinations of the elements of each list. This results in a tibble with two named list columns. Next we can use mutate() to pull out the names of the list and assign them to new columns, and extract the numeric IDs. Use filter() to retain only the rows where potential is less than or equal to overall. Finally get the R-squared for each row using your suggested code, and plot. (Note I did not try too hard to get the plot to look just like yours.)
library(purrr)
library(dplyr)
library(ggplot2)
library(tidyr)
r_squared_combinations <- expand_grid(l.actual, l.predicted) %>%
mutate(overall = names(l.actual),
potential = names(l.predicted),
overall_n = as.numeric(gsub('overall_', '', overall)),
potential_n = as.numeric(gsub('potential_', '', potential))) %>%
filter(potential_n <= overall_n) %>%
mutate(r_squared = map2_dbl(l.predicted, l.actual, ~ summary(lm(.x ~ .y))$r.squared))
ggplot(r_squared_combinations, aes(x = overall, y = potential, fill = r_squared, label = round(r_squared, 3))) +
geom_tile() +
geom_text(color = 'white')
Side note: incidentally the base function expand.grid() would work about as well as tidyr::expand_grid() but expand_grid() returns a tibble by default which may be more convenient if you are using tidyverse functions otherwise.

Lookup tables in R

I have a tibble with a ton of data in it, but most importantly, I have a column that references a row in a lookup table by number (ex. 1,2,3 etc).
df <- tibble(ref = c(1,1,1,2,5)
data = c(33,34,35,35,32))
lkup <- tibble(CurveID <- c(1,2,3,4,5)
Slope <- c(-3.8,-3.5,-3.1,-3.3,-3.3)
Intercept <- c(40,38,40,38,36)
Min <- c(25,25,21,21,18)
Max <- c(36,36,38,37,32))
I need to do a calculation for each row in the original tibble based on the information in the referenced row in the lookup table.
df$result <- df$data - lkup$intercept[lkup$CurveID == df$ref]/lkup$slope[lkup$CurveID == df$ref]
The idea is to access the slope or intercept (etc) value from the correct row of the lookup table based on the number in the data table, and to do this for each data point in the column. But I keep getting an error telling me my data isn't compatible, and that my objects need to be of the same length.
You could also do it with match()
df$result <- df$data - lkup$Intercept[match(df$ref, lkup$CurveID)]/lkup$Slope[match(df$ref, lkup$CurveID)]
df$result
# [1] 43.52632 44.52632 45.52632 45.85714 42.90909
You could use the dplyr package to join the tibbles together. If the ref column and CurveID column have the same name then left_join will combine the two tibbles by the matching rows.
library(dplyr)
df <- tibble(CurveID = c(1,1,1,2,5),
data = c(33,34,35,35,32))
lkup <- tibble(CurveID = c(1,2,3,4,5),
Slope = c(-3.8,-3.5,-3.1,-3.3,-3.3),
Intercept = c(40,38,40,38,36),
Min = c(25,25,21,21,18),
Max = c(36,36,38,37,32))
df <- df %>% left_join(lkup, by = "CurveID")
Then do the calcuation on each row
df <- df %>% mutate(result = data - (Intercept/Slope)) %>%
select(CurveID, data, result)
For completeness' sake, here's one way to literally do what OP was trying:
library(slider)
df %>%
mutate(result = slide_dbl(ref, ~ slice(lkup, .x)$Intercept /
slice(lkup, .x)$Slope))
though since slice goes by row number, this relies on CurveID equalling the row number (we make no reference to CurveID at all). You can write it differently with filter but it ends up being more code.

Log Transform many variables in R with loop

I have a data frame that has a binary variable for diagnosis (column 1) and 165 nutrient variables (columns 2-166) for n=237. Let’s call this dataset nutr_all. I need to create 165 new variables that take the natural log of each of the nutrient variables. So, I want to end up with a data frame that has 331 columns - column 1 = diagnosis, cols 2-166 = nutrient variables, cols 167-331 = log transformed nutrient variables. I would like these variables to take the name of the old variables but with "_log" at the end
I have tried using a for loop and the mutate command, but, I'm not very well versed in r, so, I am struggling quite a bit.
for (nutr in (nutr_all_nomiss[,2:166])){
nutr_all_log <- mutate(nutr_all, nutr_log = log(nutr) )
}
When I do this, it just creates a single new variable called nutr_log. I know I need to let r know that the "nutr" in "nutr_log" is the variable name in the for loop, but I'm not sure how.
For any encountering this page more recently, dplyr::across() was introduced in late 2020 and it is built for exactly this task - applying the same transformation to many columns all at once.
A simple solution is below.
If you need to be selective about which columns you want to transform, check out the tidyselect helper functions by running ?tidyr_tidy_select in the R console.
library(tidyverse)
# create vector of column names
variable_names <- paste0("nutrient_variable_", 1:165)
# create random data for example
data_values <- purrr::rerun(.n = 165,
sample(x=100,
size=237,
replace = T))
# set names of the columns, coerce to a tibble,
# and add the diagnosis column
nutr_all <- data_values %>%
set_names(variable_names) %>%
as_tibble() %>%
mutate(diagnosis = 1:237) %>%
relocate(diagnosis, .before = everything())
# use across to perform same transformation on all columns
# whose names contain the phrase 'nutrient_variable'
nutr_all_with_logs <- nutr_all %>%
mutate(across(
.cols = contains('nutrient_variable'),
.fns = list(log10 = log10),
.names = "{.col}_{.fn}"))
# print out a small sample of data to validate
nutr_all_with_logs[1:5, c(1, 2:3, 166:168)]
Personally, instead of adding all the columns to the data frame,
I would prefer to make a new data frame that contains only the
transformed values, and change the column names:
logs_only <- nutr_all %>%
mutate(across(
.cols = contains('nutrient_variable'),
.fns = log10)) %>%
rename_with(.cols = contains('nutrient_variable'),
.fn = ~paste0(., '_log10'))
logs_only[1:5, 1:3]
We can use mutate_at
library(dplyr)
nutr_all_log <- nutr_all_nomiss %>%
mutate_at(2:166, list(nutr_log = ~ log(.)))
In base R, we can do this directly on the data.frame
nm1 <- paste0(names(nutr_all_nomiss)[2:166], "_nutr_log")
nutr_all_nomiss[nm1] <- log(nutr_all_nomiss[nm1])
In base R, we can use lapply :
nutr_all_nomiss[paste0(names(nutr_all_nomiss)[2:166], "_log")] <- lapply(nutr_all_nomiss[2:166], log)
Here is a solution using only base R:
First I will create a dataset equivalent to yours:
nutr_all <- data.frame(
diagnosis = sample(c(0, 1), size = 237, replace = TRUE)
)
for(i in 2:166){
nutr_all[i] <- runif(n = 237, 1, 10)
names(nutr_all)[i] <- paste0("nutrient_", i-1)
}
Now let's create the new variables and append them to the data frame:
nutr_all_log <- cbind(nutr_all, log(nutr_all[, -1]))
And this takes care of the names:
names(nutr_all_log)[167:331] <- paste0(names(nutr_all[-1]), "_log")
given function using dplyr will do your task, which can be used to get log transformation for all variables in the dataset, it also checks if the column has -ive values. currently, in this function it will not calculate the log for those parameters,
logTransformation<- function(ds)
{
# this function creats log transformation of dataframe for only varibles which are positive in nature
# args:
# ds : Dataset
require(dplyr)
if(!class(ds)=="data.frame" ) { stop("ds must be a data frame")}
ds <- ds %>%
dplyr::select_if(is.numeric)
# to get only postive variables
varList<- names(ds)[sapply(ds, function(x) min(x,na.rm = T))>0]
ds<- ds %>%
dplyr::select(all_of(varList)) %>%
dplyr::mutate_at(
setNames(varList, paste0(varList,"_log")), log)
)
return(ds)
}
you can use it for your case as :
#assuming your binary variable has namebinaryVar
nutr_allTransformed<- nutr_all %>% dplyr::select(-binaryVar) %>% logTransformation()
if you want to have negative variables too, replace varlist as below:
varList<- names(ds)

R rowsums if colnames match two arguments in a second attribute table

I want to calculate rowsums only if colnames (i.e. species) of my data frame match two arguments in a second attribute table. This means it shoul first match the name in a column of the attributes table AND have a certain entry in another column of the attribute table.
However, the attribute table contains more species than the orginal data frame.
I tried :
# Species data from vegan package:
data(varespec, package = "vegan")
# create attributes table
attributes <- matrix(NA, length(varespec), 2)
attributes[,1] <- colnames(varespec)
attributes[,2] <- c(rep("MI",14),rep("PI",30))
# add species to the attribute table
x <- c("spec1","MI")
y <- c("spec2","PI")
attributes <- rbind(attributes, x, y)
row.names(attributes) <- c(1:46)
# calculate rowsums only for species contained in the attributes table
# and having the entry "MI" in the attributes table
for (i in 1:44){
for (j in 1:46){
if ((colnames(varespec)[i] == attributes[j,1]) & (attributes[j,2] == "MI")) {
apply(varespec,1,sum)
}
}}
But it always summed up the whole rows and not only the MI - species.
This is easy to solve if you convert the dataset into a long format
library(dplyr)
library(tidyr)
data(varespec, package = "vegan")
attributes <- data.frame(
Species = c(colnames(varespec), "spec1", "spec2"),
Attribute = c(rep(c("MI", "PI"), c(14, 30)), "MI", "PI")
)
varespec %>%
add_rownames("ID") %>%
gather(Species, Value, -ID) %>% #convert to long format
inner_join(attributes, by = "Species") %>%
filter(Attribute == "MI") %>%
group_by(ID) %>%
summarise(Total = sum(Value))

Resources