R rowSums if colnames match two arguments in a second attribute table

I want to calculate rowSums only if the colnames (i.e. species) of my data frame match two arguments in a second attribute table. This means a column should first match the name in one column of the attribute table AND have a certain entry in another column of the attribute table.
However, the attribute table contains more species than the original data frame.
I tried:
# Species data from vegan package:
data(varespec, package = "vegan")
# create attributes table
attributes <- matrix(NA, length(varespec), 2)
attributes[,1] <- colnames(varespec)
attributes[,2] <- c(rep("MI",14),rep("PI",30))
# add species to the attribute table
x <- c("spec1","MI")
y <- c("spec2","PI")
attributes <- rbind(attributes, x, y)
row.names(attributes) <- c(1:46)
# calculate rowsums only for species contained in the attributes table
# and having the entry "MI" in the attributes table
for (i in 1:44) {
  for (j in 1:46) {
    if ((colnames(varespec)[i] == attributes[j, 1]) & (attributes[j, 2] == "MI")) {
      apply(varespec, 1, sum)
    }
  }
}
But it always summed up the whole rows and not only the MI - species.

This is easy to solve if you convert the dataset into a long format
library(dplyr)
library(tidyr)
data(varespec, package = "vegan")
attributes <- data.frame(
  Species = c(colnames(varespec), "spec1", "spec2"),
  Attribute = c(rep(c("MI", "PI"), c(14, 30)), "MI", "PI")
)
varespec %>%
  add_rownames("ID") %>%
  gather(Species, Value, -ID) %>% # convert to long format
  inner_join(attributes, by = "Species") %>%
  filter(Attribute == "MI") %>%
  group_by(ID) %>%
  summarise(Total = sum(Value))
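If you prefer to stay in wide format, here is a base R sketch of the same idea (using the attributes data frame built above): find the "MI" species that actually occur in varespec, then call rowSums() on just those columns.
mi_species <- attributes$Species[attributes$Attribute == "MI"]
keep <- intersect(colnames(varespec), mi_species)   # drop species absent from varespec
rowSums(varespec[, keep, drop = FALSE])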

Related

sam_data gets transformed into NAs and numerical values when using merge_samples phyloseq

I have sam_data looking like this
I want to use the merge_samples() command on the "genus".
I have tried with code looking like this:
merge_physeq <- merge_samples(physeq, "genus")
What happens is that I get several warnings like this:
In asMethod(object) : NAs introduced by coercion
And my sam_data ends up looking like this:
I think this is due to the nature of the variable: the different genera are factor variables. But all my other data is also changed. The lower taxonomic levels are not important after the merge, so they can be removed.
This is because you have not specified an appropriate fun argument to the merge_samples function. By default it calculates the mean, which is possible for numeric, logical and factor types, but not for character strings.
Unfortunately, it seems that phyloseq has hardcoded coercion to numeric for all columns in the sample data within the merge_samples function before applying the specified function, so I'm not sure if it is possible at all to deal with character type columns through merge_samples.
As a solution, try the merge_ps_samples function below, which should merge samples and then summarize any column according to its class. For numeric columns it calculates the mean within each group, for character types it pastes together the unique values within each group. You can of course choose any function you like, just alter the code to your desire.
Hope this helps! Good luck.
EDIT: note that this function calculates mean abundances, rather than summed abundances as implemented in the phyloseq::merge_samples function.
require(phyloseq)
require(tidyverse)

# Concatenate unique values in a vector
concat_unique <- function(vec){
  uniq <- unique(as.character(vec))
  return(paste(uniq, collapse = "/"))
}

# Like psmelt, but only uses the otu_table and sample_data
ps_semi_melt <- function(ps){
  otu_table(ps) %>%
    data.frame(taxid = row.names(.)) %>%
    rename_with(function(x){gsub("X", "", x)}) %>%
    pivot_longer(!taxid, names_to = "sample_id", values_to = "abundance") %>%
    left_join(sample_data(ps) %>%
                data.frame(sample_id = row.names(.)),
              by = "sample_id")
}

# Function that summarizes a vector based on its class
summarise_vec <- function(vec){
  if(class(vec) %in% c("numeric", "integer", "logical")){
    return(mean(vec, na.rm = T))
  } else if (class(vec) %in% c("factor", "character")){
    return(concat_unique(vec))
  } else {
    stop("Error: unknown column type")
  }
}

# Converts a summary df to an otu_table
summ_to_otu_tbl <- function(summ){
  summ %>%
    select(taxid, sample_id, abundance) %>%
    pivot_wider(names_from = "sample_id", values_from = "abundance") %>%
    column_to_rownames('taxid') %>%
    as.matrix() %>%
    otu_table(taxa_are_rows = TRUE)
}

# Converts a summary df to sample_data
summ_to_sample_dat <- function(summ){
  summ %>%
    select(!c(taxid, abundance)) %>%
    unique() %>%
    column_to_rownames('sample_id') %>%
    sample_data()
}

# Function that merges phyloseq samples based on the names of one or more
# grouping factors present in sample_data(ps)
merge_ps_samples <- function(ps, grouping){
  # Make sure taxa are rows
  if (!phyloseq::taxa_are_rows(ps)) {
    otu_table(ps) <- phyloseq::otu_table(t(otu_table(ps)), taxa_are_rows = T)
  }
  # Convert to long format
  ps_long <- ps_semi_melt(ps)
  # Summarise all columns
  summ <- ps_long %>%
    group_by(across(all_of(grouping))) %>%
    group_by(taxid, .add = T) %>%
    summarise(across(everything(), summarise_vec)) %>%
    ungroup()
  # Convert to otu_table and sample_data
  otu_tbl <- summ_to_otu_tbl(summ)
  sample_dat <- summ_to_sample_dat(summ)
  # Create new physeq object
  new_ps <- phyloseq(otu_tbl, sample_dat, tax_table(ps))
  return(new_ps)
}

data("GlobalPatterns")
ps <- GlobalPatterns
merged_ps <- merge_ps_samples(ps, grouping = "SampleType")
data("GlobalPatterns")
ps <- GlobalPatterns
merged_ps <- merge_ps_samples(ps, grouping = "SampleType")
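As a quick, optional check of the result (not part of the original answer), the merged object should now contain one sample per SampleType, with numeric sample variables averaged and character variables concatenated:
nsamples(merged_ps)                  # one merged sample per SampleType
data.frame(sample_data(merged_ps))   # averaged numeric columns, concatenated character columns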

Convert data frame columns to vectors

I have a dataframe named "Continents_tmap" from which I want to return 3 vectors, as per the following examples. Note: the "Values" vector needs to contain Cases, as named in the dataframe.
labels = c("France","Germany","India", etc)
Parent = c("Europe","Europe","Asia",etc)
Values = c(100,345,456,etc)
My current code is as follows.
covid_1 <-
  read.csv("C:/Users/Owner/Downloads/COVID-19 Activity.csv", stringsAsFactors = FALSE)
df1 <-
  select(
    covid_1,
    REPORT_DATE,
    COUNTRY_SHORT_NAME,
    COUNTRY_ALPHA_3_CODE,
    PEOPLE_DEATH_NEW_COUNT,
    PEOPLE_POSITIVE_NEW_CASES_COUNT,
    PEOPLE_DEATH_COUNT,
    PEOPLE_POSITIVE_CASES_COUNT,
    CONTINENT_NAME
  )
Continents_tmap <- df1 %>%
  group_by(Continent, Country.x) %>%
  summarise(Deaths = sum(Deaths), Cases = sum(Positive_Cases))
Continents_tmap <- data.frame(Continents_tmap)
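A minimal sketch of extracting those vectors, assuming the summarised Continents_tmap ends up with the columns Country.x, Continent and Cases used above:
labels <- Continents_tmap$Country.x   # country names
Parent <- Continents_tmap$Continent   # continent for each country
Values <- Continents_tmap$Cases       # case counts per country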

Lookup tables in R

I have a tibble with a ton of data in it, but most importantly, I have a column that references a row in a lookup table by number (ex. 1,2,3 etc).
df <- tibble(ref = c(1,1,1,2,5),
             data = c(33,34,35,35,32))
lkup <- tibble(CurveID = c(1,2,3,4,5),
               Slope = c(-3.8,-3.5,-3.1,-3.3,-3.3),
               Intercept = c(40,38,40,38,36),
               Min = c(25,25,21,21,18),
               Max = c(36,36,38,37,32))
I need to do a calculation for each row in the original tibble based on the information in the referenced row in the lookup table.
df$result <- df$data - lkup$Intercept[lkup$CurveID == df$ref]/lkup$Slope[lkup$CurveID == df$ref]
The idea is to access the slope or intercept (etc) value from the correct row of the lookup table based on the number in the data table, and to do this for each data point in the column. But I keep getting an error telling me my data isn't compatible, and that my objects need to be of the same length.
You could also do it with match()
df$result <- df$data - lkup$Intercept[match(df$ref, lkup$CurveID)]/lkup$Slope[match(df$ref, lkup$CurveID)]
df$result
# [1] 43.52632 44.52632 45.52632 45.85714 42.90909
You could use the dplyr package to join the tibbles together. If the ref column and CurveID column have the same name then left_join will combine the two tibbles by the matching rows.
library(dplyr)
df <- tibble(CurveID = c(1,1,1,2,5),
             data = c(33,34,35,35,32))
lkup <- tibble(CurveID = c(1,2,3,4,5),
               Slope = c(-3.8,-3.5,-3.1,-3.3,-3.3),
               Intercept = c(40,38,40,38,36),
               Min = c(25,25,21,21,18),
               Max = c(36,36,38,37,32))
df <- df %>% left_join(lkup, by = "CurveID")
Then do the calculation on each row
df <- df %>% mutate(result = data - (Intercept / Slope)) %>%
  select(CurveID, data, result)
For completeness' sake, here's one way to literally do what OP was trying:
library(slider)
df %>%
  mutate(result = data - slide_dbl(ref, ~ slice(lkup, .x)$Intercept /
                                          slice(lkup, .x)$Slope))
though since slice goes by row number, this relies on CurveID equalling the row number (we make no reference to CurveID at all). You can write it differently with filter but it ends up being more code.
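For reference, here is a sketch of that filter() variant (same df and lkup as above; unlike slice(), it matches on CurveID, so it does not depend on row order):
library(dplyr)
library(slider)
df %>%
  mutate(result = data - slide_dbl(ref, function(id) {
    row <- filter(lkup, CurveID == id)   # look up the matching curve
    row$Intercept / row$Slope
  }))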

Mutate ifelse on a vector

Let's say I have this data frame:
library(tidyverse)
set.seed(2)
df <- iris[c(1:5, 51:55, 101:105), ]
df_long <- gather(df, key = "flower_att", value = "measurement",
                  Sepal.Length, Sepal.Width, Petal.Length, Petal.Width)
df_long$setosa_sub <- sample(5, size = 60, replace = TRUE)
df_long$versicolor_sub <- sample(5, size = 60, replace = TRUE)
df_long$virginica_sub <- sample(5, size = 60, replace = TRUE)
df_long$sub_q <- 0
Now I want to copy a value into the sub_q variable based on the Species variable and the corresponding sub values.
I know how to do it one by one:
df_long2 <- df_long %>%
  mutate(sub_q = ifelse(Species == "setosa", setosa_sub, sub_q)) %>%
  mutate(sub_q = ifelse(Species == "versicolor", versicolor_sub, sub_q)) %>%
  mutate(sub_q = ifelse(Species == "virginica", virginica_sub, sub_q))
But I can't figure out the right way to apply this over a vector of the Species values instead.
species_vector <- c("setosa","versicolor","virginica")
I'm actually not sure if I need to make a new function or just loop it somehow. Hope it makes sense...
I don't see anything wrong with the way you are doing it. Another way, using an apply function (sapply in this case) would work like this:
# a helper function to find the right value for the xth row
get_correct_sub <- function(x){
  col_name <- paste0(df_long$Species[x], '_sub')
  df_long[[col_name]][x]
}
# apply each row index to the helper function
df_long2 = df_long
df_long2$sub_q = sapply(1:nrow(df_long), get_correct_sub)
The helper function adds "_sub" to the species name, treats that as a column name, and then gets the value for that column.
Here is a datastep() solution from the libr package. I create a vector lookup to map each Species to the desired column, then step through the data row by row and assign the value using the lookup. Inside datastep(), data is the input dataset and n. is the current row number:
library(libr)
# Create vector lookup
species_vector <- c("setosa" = "setosa_sub", "versicolor" = "versicolor_sub", "virginica" = "virginica_sub")
# Step through data row by row, and assign value using lookup
df_long2 <- df_long %>%
  datastep({
    sub_q <- data[n., species_vector[Species]]
  })
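Another compact option, not from the original answers but a sketch of the same idea with plain dplyr: work rowwise() and look the column up by its constructed name with get().
df_long2 <- df_long %>%
  rowwise() %>%
  mutate(sub_q = get(paste0(Species, "_sub"))) %>%   # e.g. "setosa_sub" for setosa rows
  ungroup()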

Log Transform many variables in R with loop

I have a data frame that has a binary variable for diagnosis (column 1) and 165 nutrient variables (columns 2-166) for n=237. Let's call this dataset nutr_all. I need to create 165 new variables that take the natural log of each of the nutrient variables. So, I want to end up with a data frame that has 331 columns: column 1 = diagnosis, cols 2-166 = nutrient variables, cols 167-331 = log transformed nutrient variables. I would like these variables to take the name of the old variables but with "_log" at the end.
I have tried using a for loop and the mutate command but, as I'm not very well versed in R, I am struggling quite a bit.
for (nutr in (nutr_all_nomiss[, 2:166])) {
  nutr_all_log <- mutate(nutr_all, nutr_log = log(nutr))
}
When I do this, it just creates a single new variable called nutr_log. I know I need to let R know that the "nutr" in "nutr_log" refers to the variable name in the for loop, but I'm not sure how.
For anyone encountering this page more recently, dplyr::across() was introduced in late 2020 and it is built for exactly this task: applying the same transformation to many columns at once.
A simple solution is below.
If you need to be selective about which columns you want to transform, check out the tidyselect helper functions by running ?tidyr_tidy_select in the R console.
library(tidyverse)
# create vector of column names
variable_names <- paste0("nutrient_variable_", 1:165)
# create random data for example
data_values <- purrr::rerun(.n = 165,
                            sample(x = 100,
                                   size = 237,
                                   replace = T))
# set names of the columns, coerce to a tibble,
# and add the diagnosis column
nutr_all <- data_values %>%
  set_names(variable_names) %>%
  as_tibble() %>%
  mutate(diagnosis = 1:237) %>%
  relocate(diagnosis, .before = everything())
# use across to perform same transformation on all columns
# whose names contain the phrase 'nutrient_variable'
nutr_all_with_logs <- nutr_all %>%
  mutate(across(
    .cols = contains('nutrient_variable'),
    .fns = list(log10 = log10),
    .names = "{.col}_{.fn}"))
# print out a small sample of data to validate
nutr_all_with_logs[1:5, c(1, 2:3, 166:168)]
Personally, instead of adding all the columns to the data frame, I would prefer to make a new data frame that contains only the transformed values, and change the column names:
logs_only <- nutr_all %>%
  mutate(across(
    .cols = contains('nutrient_variable'),
    .fns = log10)) %>%
  rename_with(.cols = contains('nutrient_variable'),
              .fn = ~ paste0(., '_log10'))
logs_only[1:5, 1:3]
We can use mutate_at
library(dplyr)
nutr_all_log <- nutr_all_nomiss %>%
  mutate_at(2:166, list(nutr_log = ~ log(.)))
In base R, we can do this directly on the data.frame
nm1 <- paste0(names(nutr_all_nomiss)[2:166], "_nutr_log")
nutr_all_nomiss[nm1] <- log(nutr_all_nomiss[2:166])
In base R, we can use lapply:
nutr_all_nomiss[paste0(names(nutr_all_nomiss)[2:166], "_log")] <- lapply(nutr_all_nomiss[2:166], log)
Here is a solution using only base R:
First I will create a dataset equivalent to yours:
nutr_all <- data.frame(
  diagnosis = sample(c(0, 1), size = 237, replace = TRUE)
)
for (i in 2:166) {
  nutr_all[i] <- runif(n = 237, 1, 10)
  names(nutr_all)[i] <- paste0("nutrient_", i - 1)
}
Now let's create the new variables and append them to the data frame:
nutr_all_log <- cbind(nutr_all, log(nutr_all[, -1]))
And this takes care of the names:
names(nutr_all_log)[167:331] <- paste0(names(nutr_all[-1]), "_log")
The function below, written with dplyr, will do your task. It can be used to get the log transformation of all numeric variables in a dataset, and it also checks whether a column has negative values; in its current form it does not calculate the log for those columns.
logTransformation <- function(ds)
{
  # this function creates a log transformation of the data frame,
  # only for variables which are positive in nature
  # args:
  #   ds : Dataset
  require(dplyr)
  if (!is.data.frame(ds)) { stop("ds must be a data frame") }
  ds <- ds %>%
    dplyr::select_if(is.numeric)
  # to get only positive variables
  varList <- names(ds)[sapply(ds, function(x) min(x, na.rm = TRUE)) > 0]
  ds <- ds %>%
    dplyr::select(all_of(varList)) %>%
    dplyr::mutate_at(
      setNames(varList, paste0(varList, "_log")), log)
  return(ds)
}
You can use it for your case as:
# assuming your binary variable is named binaryVar
nutr_allTransformed <- nutr_all %>% dplyr::select(-binaryVar) %>% logTransformation()
If you want to include negative variables too, replace varList as below:
varList<- names(ds)
