Automatically split function output (list) into component data.frames - r

I have a functions which yields 2 dataframes. As functions can only return one object, I combined these dataframes as a list. However, I need to work with both dataframes separately. Is there a way to automatically split the list into the component dataframes, or to write the function in a way that both objects are returned separately?
The function:
install.packages("plyr")
require(plyr)
fun.docmerge <- function(x, y, z, crit, typ, doc = checkmerge) {
mergedat <- paste(deparse(substitute(x)), "+",
deparse(substitute(y)), "=", z)
countdat <- nrow(x)
check_t1 <- data.frame(mergedat, countdat)
z1 <- join(x, y, by = crit, type = typ)
countdat <- nrow(z1)
check_t2 <- data.frame(mergedat, countdat)
doc <- rbind(doc, check_t1, check_t2)
t1<-list()
t1[["checkmerge"]]<-doc
t1[[z]]<-z1
return(t1)
}
This is the call to the function, saving the result list to the new object results.
results <- fun.docmerge(x = df1, y = df2, z = "df3", crit = c("id"), typ = "left")
In the following sample data to replicate the problem:
df1 <- structure(list(id = c("XXX1", "XXX2", "XXX3",
"XXX4"), tr.isincode = c("ISIN1", "ISIN2",
"ISIN3", "ISIN4")), .Names = c("id", "isin"
), row.names = c(NA, 4L), class = "data.frame")
df2 <- structure(list(id= c("XXX1", "XXX5"), wrong= c(1L,
1L)), .Names = c("id", "wrong"), row.names = 1:2, class = "data.frame")
checkmerge <- structure(list(mergedat = structure(integer(0), .Label = character(0), class = "factor"),
countdat = numeric(0)), .Names = c("mergedat", "countdat"
), row.names = integer(0), class = "data.frame")
In the example, a list with the dataframes df3 and checkmerge are returned. I would need both dataframes separately. I know that I could do it via manual assignment (e.g., checkmerge <- results$checkmerge) but I want to eliminate manual changes as much as possible and am therefore looking for an automated way.

Related

Estimate_richness for all phyla in phyloseq

Is there an easy way to get ASV richness for each Phylum for each Station using the estimate_richness function in phyloseq? Or is there another simple way of extracting the abundance data for each taxonomic rank and calculating richness that way?
So far I have just been subsetting individual Phyla of interest using for example:
ps.Prymnesiophyceae <- subset_taxa(ps, Phylum == "Prymnesiophyceae")
alpha_diversity<-estimate_richness(ps.Prymnesiophyceae,measure=c("Shannon","Observed"))
H<-alpha_diversity$Shannon
S1<-alpha_diversity$Observed
S<-log(S1)
evenness<-H/S
alpha<-cbind(Shannon=H,Richness=S1,Evenness=evenness,sample_data(Prymnesiophyceae))
But this is rather a pain when having to do it for e.g. the top 20 phyla.
EDIT:
suggestion by #GTM works well until last step. See comment + dput:
> dput(head(sample_names(ps.transect), n=2)) c("2-1-DCM_S21_L001_R1_001.fastq", "2-1-SA_S9_L001_R1_001.fastq" )
> dput(head(alpha, n=2)) structure(list(Observed = c(31, 25), Shannon = c(2.84184012598765,
2.53358345702604), taxon = c("Prymnesiophyceae", "Prymnesiophyceae" ), sample_id = c("X2.1.DCM_S21_L001_R1_001.fastq", "X2.1.SA_S9_L001_R1_001.fastq" ), S = c(3.43398720448515,
3.2188758248682), evenness = c(0.827562817437384,
0.787101955736294)), row.names = c("X2.1.DCM_S21_L001_R1_001.fastq", "X2.1.SA_S9_L001_R1_001.fastq"), class = "data.frame")
> dput(head(smpl_data, n=1)) new("sample_data", .Data = list("001_DCM", 125L, structure(1L, .Label = "DCM", class = "factor"), structure(1L, .Label = "Transect", class = "factor"), structure(1L, .Label = "STZ", class = "factor"),
structure(1L, .Label = "STFW", class = "factor"), "Oligotrophic",
16L, -149.9978333, -29.997, 130.634, 17.1252, 35.4443, 1025.835008,
1.1968, 1e-12, 5.387, 2.8469, 52.26978546, 98.0505, 0, 0,
0.02, 0.9, 0, 0, 2069.47, 8.057, 377.3), names = c("Station_neat", "Depth_our", "Depth_bin", "Loc", "Front", "Water", "Zone", "Bottle", "Lon", "Lat", "pressure..db.", "Temperature", "Salinity", "Density_kgm.3", "Fluorescence_ugL", "PAR", "BottleO2_mLL", "CTDO2._mLL", "OxygenSat_.", "Beam_Transmission", "N_umolL", "NO3_umolL", "PO4_umolL", "SIL_umolL", "NO2_umolL", "NH4_umolL", "DIC_uMkg", "pH", "pCO2_matm"), row.names = "2-1-DCM_S21_L001_R1_001.fastq",
.S3Class = "data.frame")
You can wrap your code in a for loop to do so. I've slightly modified your code to make it a bit more flexible, see below.
require("phyloseq")
require("dplyr")
# Calculate alpha diversity measures for a specific taxon at a specified rank.
# You can pass any parameters that you normally pass to `estimate_richness`
estimate_diversity_for_taxon <- function(ps, taxon_name, tax_rank = "Phylum", ...){
# Subset to taxon of interest
tax_tbl <- as.data.frame(tax_table(ps))
keep <- tax_tbl[,tax_rank] == taxon_name
keep[is.na(keep)] <- FALSE
ps_phylum <- prune_taxa(keep, ps)
# Calculate alpha diversity and generate a table
alpha_diversity <- estimate_richness(ps_phylum, ...)
alpha_diversity$taxon <- taxon_name
alpha_diversity$sample_id <- row.names(alpha_diversity)
return(alpha_diversity)
}
# Load data
data(GlobalPatterns)
ps <- GlobalPatterns
# Estimate alpha diversity for each phylum
phyla <- get_taxa_unique(ps,
taxonomic.rank = 'Phylum')
phyla <- phyla[!is.na(phyla)]
alpha <- data.frame()
for (phylum in phyla){
a <- estimate_diversity_for_taxon(ps = ps,
taxon_name = phylum,
measure = c("Shannon", "Observed"))
alpha <- rbind(alpha, a)
}
# Calculate the additional alpha diversity measures
alpha$S <- log(alpha$Observed)
alpha$evenness <- alpha$Shannon/alpha$S
# Add sample data
smpl_data <- as.data.frame(sample_data(ps))
alpha <- left_join(alpha,
smpl_data,
by = c("sample_id" = "X.SampleID"))
This is a reproducible example with GlobalPatterns. Make sure to alter the code to match your data by replacing X.SampleID in the left join with the name of the column that contains the sample IDs in your sample_data. If there is no such column, you can create it from the row names:
smpl_data <- as.data.frame(sample_data(ps))
smpl_data$sample_id < row.names(smpl_data)
alpha <- left_join(alpha,
smpl_data,
by = c("sample_id" = "sample_id"))

execute different functions considering output in r

Let's say I have 2 different functions to apply. For example, these functions are max and min . After applying bunch of functions I am getting outputs below. I want to assign a function to each output.
Here is my data and its structure.
data<-structure(list(Apr = structure(list(`a1` = structure(list(
date = c("04-01-2036", "04-02-2036", "04-03-2036"), value = c(0,
3.13, 20.64)), .Names = c("date", "value"), row.names = 92:94, class = "data.frame"),
`a2` = structure(list(date = c("04-01-2037", "04-02-2037",
"04-03-2037"), value = c(5.32, 82.47, 15.56)), .Names = c("date",
"value"), row.names = 457:459, class = "data.frame")), .Names = c("a1",
"a2")), Dec = structure(list(`d1` = structure(list(
date = c("12-01-2039", "12-02-2039", "12-03-2039"), value = c(3,
0, 11)), .Names = c("date", "value"), row.names = 1431:1433, class = "data.frame"),
`d2` = structure(list(date = c("12-01-2064", "12-02-2064",
"12-03-2064"), value = c(0, 5, 0)), .Names = c("date", "value"
), row.names = 10563:10565, class = "data.frame")), .Names = c("d1",
"d2"))), .Names = c("Apr", "Dec"))
I applied these functions:
drop<-function(y){
lapply(y, function(x)(x[!(names(x) %in% c("date"))]))
}
q1<-lapply(data, drop)
q2<-lapply(q1, function(x) unlist(x,recursive = FALSE))
daily_max<-lapply(q2, function(x) lapply(x, max))
dailymax <- data.frame(matrix(unlist(daily_max), nrow=length(daily_max), byrow=TRUE))
row.names(dailymax)<-names(daily_max)
max_value <- apply(dailymax, 1, which.max)
And I'm getting
Apr Dec
2 1
And I am applying any random function to both Apr[2] and Dec[1] like:
Map(function(x, y) sum(x[[y]]), q2, max_value)
So, the function will be executed considering the outputs (to Apr's second element which is a1, Dec's first element which is a2.) As you can see, there are outputs as numbers 1 and 2.
What I want
What I want is assigning specific functions to 1 and 2. If output is 1 then max function; if it is 2, min function will be executed. In conclusion, max function will be applied to Apr[2] and min function will be applied to Dec[1].
I will get this:
min(q2$Apr$a2.value)
[1] 5.32
max(q2$Dec$d2.value)
[1] 5
How can I achieve this automatically for all my functions?
You can take help of switch here to apply a function based on number in max_value.
apply_function <- function(x, num) switch(num, `1` = max, `2` = min)(x)
Map(function(x, y) apply_function(x[[y]], y), q2, max_value)
#$Apr
#[1] 5.32
#$Dec
#[1] 11
Map returns a list if you want a vector output use mapply.

How to select one value of a data.frame within a list column with R?

I have a data.frame that contains a type column. The list contains a 1x3 data.frame. I only want one value from this list. Thus will flatten my data.frame so I can write out a csv.
How do I select one item from the nested data.frame (see the 2nd column)?
Here's the nested col. I'd provide the data but cannot flatten to write_csv.
result of dput:
structure(list(id = c("1386707", "1386700", "1386462", "1386340",
"1386246", "1386300"), fields.created = c("2020-05-07T02:09:27.000-0700",
"2020-05-07T01:20:11.000-0700", "2020-05-06T21:38:14.000-0700",
"2020-05-06T07:19:44.000-0700", "2020-05-06T06:11:43.000-0700",
"2020-05-06T02:26:44.000-0700"), fields.customfield_10303 = c(NA,
NA, 3, 3, NA, NA), fields.customfield_28100 = list(NULL, structure(list(
self = ".../rest/api/2/customFieldOption/76412",
value = "New Feature", id = "76412"), .Names = c("self",
"value", "id"), class = "data.frame", row.names = 1L), structure(list(
self = ".../rest/api/2/customFieldOption/76414",
value = "Technical Debt", id = "76414"), .Names = c("self",
"value", "id"), class = "data.frame", row.names = 1L), NULL,
structure(list(self = ".../rest/api/2/customFieldOption/76411",
value = "Maintenance", id = "76411"), .Names = c("self",
"value", "id"), class = "data.frame", row.names = 1L), structure(list(
self = ".../rest/api/2/customFieldOption/76412",
value = "New Feature", id = "76412"), .Names = c("self",
"value", "id"), class = "data.frame", row.names = 1L))), row.names = c(NA,
6L), class = "data.frame", .Names = c("id", "fields.created",
"fields.customfield_10303", "fields.customfield_28100"))
I found a way to do this.
First, instead of changing the data, I added a column with mutate. Then, directly selected the same column from all nested lists. Then, I converted the list column into a vector. Finally, I cleaned it up by removing the other columns.
It seems to work. I don't know yet how it will handle multiple rows within the nested df.
dat <- sample_dat %>%
mutate(cats = sapply(nested_col, `[[`, 2)) %>%
mutate(categories = sapply(cats, toString)) %>%
select(-nested_col, -cats)
Related
How to directly select the same column from all nested lists within a list?
r-convert list column into character vector where lists are characters
library(dplyr)
library(tidyr)
df <- tibble(Group=c("A","A","B","C","D","D"),
Batman=1:6,
Superman=c("red","blue","orange","red","blue","red"))
nested <- df %>%
nest(data=-Group)
unnested <- nested %>%
unnest(data)
Nesting and unnesting data with tidyr
library(purrr)
nested %>%
mutate(data=map(data,~select(.x,2))) %>%
unnest(data)
select with purrr, but lapply as you've done is fine, it's just for aesthetics ;)

Using dygraph with xts-object drops Label in plot

I have a seemingly small challenge, but I can't get to an answer. Here is my minimum working example.
fr_nuke <- structure(list(Date = structure(c(1420070400, 1420074000, 1420077600,
1420081200, 1420084800, 1420088400), class = c("POSIXct", "POSIXt"), tzone = ""),
`61` = c(57945, 57652, 57583, 57551, 57465, 57683),
`3244` = c(72666.64, 73508.78, 69749.17, 67080.13, 66357.65, 66524.13),
`778` = c(2.1133, 2.1133, 2.1133, 2.1133, 2.1133, 2.1133),
fcasted_nuke_temp = c(54064.6099092888, 54064.6099092888, 54064.6099092888,
54064.6099092888, 54064.6099092888, 54064.6099092888),
fcasted_nuke_cons = c(55921.043096775, 56319.5688170977, 54540.4094334057,
53277.340242333, 52935.4411965463, 53014.2244890147)),
.Names = c("Date", "61", "3244", "778", "fcasted_nuke_temp", "fcasted_nuke_cons"),
row.names = c(NA, 6L), class = "data.frame")
series1 <- as.xts(fr_nuke$'61', fr_nuke$Date)
series2 <- as.xts(fr_nuke$fcasted_nuke_temp, fr_nuke$Date)
series3 <- as.xts(fr_nuke$fcasted_nuke_cons, fr_nuke$Date)
grp_input <- cbind(series1,series2,series3)
dygraph(grp_input)
The resulting plot does not show the label of the individual series. Specifying the series with
dygraph(grp_input) %>% dySeries("V1", label = "Label1")
Results in:
Error in dySeries(., "V1", label = "Label1") : One or more of the
specified series were not found. Valid series names are: ..1, ..2, ..3
However, it works if I plot only one series (e.g. series1).
dygraph(series1) %>% dySeries("V1", label = "Label1")
Either set the colnames for the grp_input object, or use merge to construct the column names from the object names.
# setting colnames
require(dygraphs)
require(xts)
grp_input <- cbind(series1, series2, series3)
colnames(grp_input) <- c("V1", "V2", "V3")
dygraph(grp_input) %>% dySeries("V1", label = "Label1")
# using merge
require(dygraphs)
require(xts)
grp_input <- merge(series1, series2, series3)
dygraph(grp_input) %>% dySeries("series1", label = "Label1")

prop.table doesn't work in a for-loop?

This may be a very simple question, but I don't see how to answer it.
I have the following reproducible code, where I have two small dataframes that I use to calculate a percentage value based on each column total:
#dataframe x
x <- structure(list(PROV = structure(c(1L, 1L), .Label = "AG", class = "factor"),
APT = structure(1:2, .Label = c("AAA", "BBB"), class = "factor"),
PAX.2013 = c(5L, 4L), PAX.2014 = c(4L, 2L), PAX.2015 = c(4L,0L)),
.Names = c("PROV", "APT", "PAX.2013", "PAX.2014", "PAX.2015"),
row.names = 1:2, class = "data.frame")
#dataframe y
y <- structure(list(PROV = structure(c(1L, 1L), .Label = "AQ", class = "factor"),
APT = structure(1:2, .Label = c("CCC", "AAA"), class = "factor"),
PAX.2013 = c(3L, 7L), PAX.2014 = c(2L, 1L), PAX.2015 = c(0L,3L)),
.Names = c("PROV", "APT", "PAX.2013", "PAX.2014", "PAX.2015"),
row.names = 1:2, class = "data.frame")
#list z (with x and y)
z <- list(x,y)
#percentage value of x and y based on columns total
round(prop.table(as.matrix(z[[1]][3:5]), margin = 2)*100,1)
round(prop.table(as.matrix(z[[2]][3:5]), margin = 2)*100,1)
as you can see, it works just fine.
Now I want to automate for all the list, but I can't figure out how to get the results. This is my simple code:
#for-loop that is not working
for (i in length(z))
{round(prop.table(as.matrix(z[[i]][3:5]), margin = 2)*100,1)}
You have two problems.
First, you have not put a range into your for loop so you are just trying to iterate over a single number and second, you are not assigning your result anywhere on each iteration.
Use 1:length(z) to define a range. Then assign the results to a variable.
This would work:
my_list <- list()
for (i in 1:length(z)){
my_list[[i]] <- round(prop.table(as.matrix(z[[i]][3:5]),
margin = 2)*100,1)
}
my_list
But it would be more efficient and idiomatic to use lapply:
lapply(1:length(z),
function(x) round(prop.table(as.matrix(z[[x]][3:5]), margin = 2)*100,1))
Barring discussions whether for-loops is the best approach, you had two issues. One, your for loop only iterates over 2 (which is length(z)) instead of 1:2. Two, you need to do something with the round(....) statement. In this solution, I added a print statement.
for (i in 1:length(z)){
print(round(prop.table(as.matrix(z[[i]][3:5]), margin = 2)*100,1))
}

Resources