execute different functions considering output in r - r

Let's say I have 2 different functions to apply. For example, these functions are max and min . After applying bunch of functions I am getting outputs below. I want to assign a function to each output.
Here is my data and its structure.
data<-structure(list(Apr = structure(list(`a1` = structure(list(
date = c("04-01-2036", "04-02-2036", "04-03-2036"), value = c(0,
3.13, 20.64)), .Names = c("date", "value"), row.names = 92:94, class = "data.frame"),
`a2` = structure(list(date = c("04-01-2037", "04-02-2037",
"04-03-2037"), value = c(5.32, 82.47, 15.56)), .Names = c("date",
"value"), row.names = 457:459, class = "data.frame")), .Names = c("a1",
"a2")), Dec = structure(list(`d1` = structure(list(
date = c("12-01-2039", "12-02-2039", "12-03-2039"), value = c(3,
0, 11)), .Names = c("date", "value"), row.names = 1431:1433, class = "data.frame"),
`d2` = structure(list(date = c("12-01-2064", "12-02-2064",
"12-03-2064"), value = c(0, 5, 0)), .Names = c("date", "value"
), row.names = 10563:10565, class = "data.frame")), .Names = c("d1",
"d2"))), .Names = c("Apr", "Dec"))
I applied these functions:
drop<-function(y){
lapply(y, function(x)(x[!(names(x) %in% c("date"))]))
}
q1<-lapply(data, drop)
q2<-lapply(q1, function(x) unlist(x,recursive = FALSE))
daily_max<-lapply(q2, function(x) lapply(x, max))
dailymax <- data.frame(matrix(unlist(daily_max), nrow=length(daily_max), byrow=TRUE))
row.names(dailymax)<-names(daily_max)
max_value <- apply(dailymax, 1, which.max)
And I'm getting
Apr Dec
2 1
And I am applying any random function to both Apr[2] and Dec[1] like:
Map(function(x, y) sum(x[[y]]), q2, max_value)
So, the function will be executed considering the outputs (to Apr's second element which is a1, Dec's first element which is a2.) As you can see, there are outputs as numbers 1 and 2.
What I want
What I want is assigning specific functions to 1 and 2. If output is 1 then max function; if it is 2, min function will be executed. In conclusion, max function will be applied to Apr[2] and min function will be applied to Dec[1].
I will get this:
min(q2$Apr$a2.value)
[1] 5.32
max(q2$Dec$d2.value)
[1] 5
How can I achieve this automatically for all my functions?

You can take help of switch here to apply a function based on number in max_value.
apply_function <- function(x, num) switch(num, `1` = max, `2` = min)(x)
Map(function(x, y) apply_function(x[[y]], y), q2, max_value)
#$Apr
#[1] 5.32
#$Dec
#[1] 11
Map returns a list if you want a vector output use mapply.

Related

How to cbind a list of tables by one column, and suffix headings with the list item name

I've got a list of dataframes. I'd like to cbind them by the index column, sample_id. Each table has the same column headings, so I can't just cbind them otherwise I won't know which list item the columns came from. The name of the list item gives the measure used to generate them, so I'd like to suffix the column headings with the list item name.
Here's a simplified demo list of dataframes:
list_of_tables <- list(number = structure(list(sample_id = structure(1:3, levels = c("CSF_1",
"CSF_2", "CSF_4"), class = "factor"), total = c(655, 331, 271
), max = c(12, 5, 7)), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame")), concentration_cm_3 = structure(list(sample_id = structure(1:3, levels = c("CSF_1",
"CSF_2", "CSF_4"), class = "factor"), total = c(121454697, 90959097,
43080697), max = c(2050000, 2140000, 915500)), row.names = c(NA,
-3L), class = c("tbl_df", "tbl", "data.frame")), volume_nm_3 = structure(list(
sample_id = structure(1:3, levels = c("CSF_1", "CSF_2", "CSF_4"
), class = "factor"), total = c(2412783009, 1293649395, 438426087
), max = c(103500000, 117400000, 23920000)), row.names = c(NA,
-3L), class = c("tbl_df", "tbl", "data.frame")), area_nm_2 = structure(list(
sample_id = structure(1:3, levels = c("CSF_1", "CSF_2", "CSF_4"
), class = "factor"), total = c(15259297.4, 7655352.2, 3775922
), max = c(266500, 289900, 100400)), row.names = c(NA, -3L
), class = c("tbl_df", "tbl", "data.frame")))
You'll see it's a list of 4 tables, and the list item names are "number", "concentration_cm_3", "volume_nm_3", and "area_nm_2".
Using join_all from plyr I can merge them all by sample_id. However, how do I suffix with the list item name?
merged_tables <- plyr::join_all(stats_by_measure, by = "sample_id", type = "left")
we could do it this way:
The trick is to use .id = 'id' in bind_rows which adds the name as a column. Then we could pivot:
library(dplyr)
library(tidyr)
bind_rows(list_of_tables, .id = 'id') %>%
pivot_wider(names_from = id,
values_from = c(total, max))
sample_id total_number total_concentration_cm_3 total_volume_nm_3 total_area_nm_2 max_number max_concentration_cm_3 max_volume_nm_3 max_area_nm_2
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 CSF_1 655 121454697 2412783009 15259297. 12 2050000 103500000 266500
2 CSF_2 331 90959097 1293649395 7655352. 5 2140000 117400000 289900
3 CSF_4 271 43080697 438426087 3775922 7 915500 23920000 100400
Probably, we may use reduce2 here with suffix option from left_join
library(dplyr)
library(purrr)
nm <- names(list_of_tables)[1]
reduce2(list_of_tables, names(list_of_tables)[-1],
function(x, y, z) left_join(x, y, by = 'sample_id', suffix = c(nm, z)))
Or if we want to use join_all, probably we can rename the columns before doing the join
library(stringr)
imap(list_of_tables, ~ {
nm <- .y
.x %>% rename_with(~str_c(.x, nm), -1)
}) %>%
plyr::join_all( by = "sample_id", type = "left")
Or use a for loop
tmp <- list_of_tables[[1]]
names(tmp)[-1] <- paste0(names(tmp)[-1], names(list_of_tables)[1])
for(nm in names(list_of_tables)[-1]) {
tmp2 <- list_of_tables[[nm]]
names(tmp2)[-1] <- paste0(names(tmp2)[-1], nm)
tmp <- left_join(tmp, tmp2, by = "sample_id")
}
tmp

apply any function to elements which returned from which.max function in r

I want to apply any function to elements which I found its location with which.max function. For example, my sample data is below:
$Apr
$Apr$`04-2036`
date value
92 04-01-2036 0.00
93 04-02-2036 3.13
94 04-03-2036 20.64
$Apr$`04-2037`
date value
457 04-01-2037 5.32
458 04-02-2037 82.47
459 04-03-2037 15.56
$Dec
$Dec$`04-2039`
date value
1431 12-01-2039 3
1432 12-02-2039 0
1433 12-03-2039 11
$Dec$`04-2064`
date value
10563 12-01-2064 0
10564 12-02-2064 5
10565 12-03-2064 0
data<-structure(list(Apr = structure(list(`04-2036` = structure(list(
date = c("04-01-2036", "04-02-2036", "04-03-2036"), value = c(0,
3.13, 20.64)), .Names = c("date", "value"), row.names = 92:94, class = "data.frame"),
`04-2037` = structure(list(date = c("04-01-2037", "04-02-2037",
"04-03-2037"), value = c(5.32, 82.47, 15.56)), .Names = c("date",
"value"), row.names = 457:459, class = "data.frame")), .Names = c("04-2036",
"04-2037")), Dec = structure(list(`04-2039` = structure(list(
date = c("12-01-2039", "12-02-2039", "12-03-2039"), value = c(3,
0, 11)), .Names = c("date", "value"), row.names = 1431:1433, class = "data.frame"),
`04-2064` = structure(list(date = c("12-01-2064", "12-02-2064",
"12-03-2064"), value = c(0, 5, 0)), .Names = c("date", "value"
), row.names = 10563:10565, class = "data.frame")), .Names = c("04-2039",
"04-2064"))), .Names = c("Apr", "Dec"))
I have found locations of maximum values for each element in lists of list using the functions below.
drop<-function(y){
lapply(y, function(x)(x[!(names(x) %in% c("date"))]))
}
q1<-lapply(data, drop)
q2<-lapply(q1, function(x) unlist(x,recursive = FALSE))
daily_max<-lapply(q2, function(x) lapply(x, max))
dailymax <- data.frame(matrix(unlist(daily_max), nrow=length(daily_max), byrow=TRUE))
row.names(dailymax)<-names(daily_max)
apply(dailymax, 1, which.max)
Locations of max. values of each element is computed as it is seen below;
Apr Dec
2 1
Now, I want to apply any function to these elements automatically for all my data(it is Apr 2 = Apr$04-2037 and Dec$2039).
You can subset and keep only the max value data in each list.
max_value <- apply(dailymax, 1, which.max)
Map(`[[`, data, max_value)
#$Apr
# date value
#457 04-01-2037 5.32
#458 04-02-2037 82.47
#459 04-03-2037 15.56
#$Dec
# date value
#1431 12-01-2039 3
#1432 12-02-2039 0
#1433 12-03-2039 11
Let's say you want to apply the function fn to this list.
fn <- function(x) {x$value <- x$value * 2;x}
You can change the Map function as -
Map(function(x, y) fn(x[[y]]), data, max_value)
#$Apr
# date value
#457 04-01-2037 10.64
#458 04-02-2037 164.94
#459 04-03-2037 31.12
#$Dec
# date value
#1431 12-01-2039 6
#1432 12-02-2039 0
#1433 12-03-2039 22

Automatically split function output (list) into component data.frames

I have a functions which yields 2 dataframes. As functions can only return one object, I combined these dataframes as a list. However, I need to work with both dataframes separately. Is there a way to automatically split the list into the component dataframes, or to write the function in a way that both objects are returned separately?
The function:
install.packages("plyr")
require(plyr)
fun.docmerge <- function(x, y, z, crit, typ, doc = checkmerge) {
mergedat <- paste(deparse(substitute(x)), "+",
deparse(substitute(y)), "=", z)
countdat <- nrow(x)
check_t1 <- data.frame(mergedat, countdat)
z1 <- join(x, y, by = crit, type = typ)
countdat <- nrow(z1)
check_t2 <- data.frame(mergedat, countdat)
doc <- rbind(doc, check_t1, check_t2)
t1<-list()
t1[["checkmerge"]]<-doc
t1[[z]]<-z1
return(t1)
}
This is the call to the function, saving the result list to the new object results.
results <- fun.docmerge(x = df1, y = df2, z = "df3", crit = c("id"), typ = "left")
In the following sample data to replicate the problem:
df1 <- structure(list(id = c("XXX1", "XXX2", "XXX3",
"XXX4"), tr.isincode = c("ISIN1", "ISIN2",
"ISIN3", "ISIN4")), .Names = c("id", "isin"
), row.names = c(NA, 4L), class = "data.frame")
df2 <- structure(list(id= c("XXX1", "XXX5"), wrong= c(1L,
1L)), .Names = c("id", "wrong"), row.names = 1:2, class = "data.frame")
checkmerge <- structure(list(mergedat = structure(integer(0), .Label = character(0), class = "factor"),
countdat = numeric(0)), .Names = c("mergedat", "countdat"
), row.names = integer(0), class = "data.frame")
In the example, a list with the dataframes df3 and checkmerge are returned. I would need both dataframes separately. I know that I could do it via manual assignment (e.g., checkmerge <- results$checkmerge) but I want to eliminate manual changes as much as possible and am therefore looking for an automated way.

Arithmetic on summarized dataframe from dplyr in R

I have a large dataset I use dplyr() summarize to generate some means.
Occasionally, I would like to perform arithmetic on that output.
For example, I would like to get the mean of means from the output below, say "m.biomass".
I've tried this mean(data.sum[,7]) and this mean(as.list(data.sum[,7])). Is there a quick and easy way to achieve this?
data.sum <-structure(list(scenario = c("future", "future", "future", "future"
), state = c("fl", "ga", "ok", "va"), m.soc = c(4090.31654013689,
3654.45350562628, 2564.33199749487, 4193.83388887064), m.npp = c(1032.244475,
821.319385, 753.401315, 636.885535), sd.soc = c(56.0344229400332,
97.8553643582118, 68.2248389927858, 79.0739969429246), sd.npp = c(34.9421782033153,
27.6443555578531, 26.0728757486901, 24.0375040705595), m.biomass = c(5322.76631158111,
3936.79457763176, 3591.0902359206, 2888.25308402464), sd.m.biomass = c(3026.59250918009,
2799.40317348016, 2515.10516340438, 2273.45510178843), max.biomass = c(9592.9303,
8105.109, 7272.4896, 6439.2259), time = c("1980-1999", "1980-1999",
"1980-1999", "1980-1999")), .Names = c("scenario", "state", "m.soc",
"m.npp", "sd.soc", "sd.npp", "m.biomass", "sd.m.biomass", "max.biomass",
"time"), class = c("grouped_df", "tbl_df", "tbl", "data.frame"
), row.names = c(NA, -4), vars = list(quote(scenario)), labels = structure(list(
scenario = "future"), class = "data.frame", row.names = c(NA,
-1), vars = list(quote(scenario)), drop = TRUE, .Names = "scenario"), indices = list(0:3))
We can use [[ to extract the column as a vector; as mean only works on a vector or a matrix -- not on a data.frame. If the OP wanted to do this on a single column, use this:
mean(data.sum[[7]])
#[1] 3934.726
If there was only the data.frame class, the data.sum[,7] would be extracting it as a vector, but the tbl_df prevents it to collapse it to vector
For multiple columns, the dplyr also has specialised functions
data.sum %>%
summarise_each(funs(mean), 3:7)

Convert a list in a dataframe to a lines

I have a dataframe (test) in R. Inside one of the columns contains coordinates in this list structure:
> dput(test$coordinates)
list(structure(list(x = c(-1.294832, -1.294883, -1.294262,
-1.249478), y = c(54.61024, 54.61008, 54.610016, 54.610006
)), .Names = c("x", "y"), row.names = c(NA, -284L), class = c("tbl_df",
"tbl", "data.frame")))
I've reduced the number of coordinates for clarity.
Ultimately I wish to convert the dataframe into a spaitial lines dataframe but to do that I need the test$coordinates in a lines form. However, I get the following error
> lines(test$coordinates)
Error in xy.coords(x, y) :
'x' is a list, but does not have components 'x' and 'y'
I have tried to convert the test$coordinates to other forms but it usually results in some error. How do I transform this list into a line?
Extra info this is a follow up question to
Convert data frame to spatial lines data frame in R with x,y x,y coordintates
UPDATE as requested dput(head(test)):
> dput(head(test))
structure(list(rid = 1, start_id = 1L, start_code = "E02002536",
end_id = 106L, end_code = "E02006909", strategy = "fastest",
distance = 12655L, time_seconds = 2921L, calories = 211L,
document.id = 1L, array.index = 1L, start = "Geranium Close",
finish = "Hylton Road", startBearing = 0, startSpeed = 0,
start_longitude = -1.294832, start_latitude = 54.610241,
finish_longitude = -1.249478, finish_latitude = 54.680691,
crow_fly_distance = 8362, event = "depart", whence = 1473171787,
speed = 20, itinerary = 419956, clientRouteId = 0, plan = "fastest",
note = "", length = 12655, time = 2921, busynance = 42172,
quietness = 30, signalledJunctions = 3, signalledCrossings = 2,
west = -1.300074, south = 54.610006, east = -1.232447, north = 54.683814,
name = "Geranium Close to Hylton Road", walk = 0, leaving = "2016-09-06 15:23:07",
arriving = "2016-09-06 16:11:48", grammesCO2saved = 2359,
calories2 = 211, type = "route", coordinates = list(structure(list(
x = c(-1.294832, -1.294883, -1.294262, -1.294141, -1.29371,
-1.293726, -1.293742, -1.29351, -1.293368, -1.292816,
-1.248019, -1.249478), y = c(54.61024, 54.61008, 54.610016,
54.610006, 54.610038, 54.610142, 54.610247, 54.610262,
54.681238, 54.680975, 54.680601, 54.680404
)), .Names = c("x", "y"), row.names = c(NA, -284L), class = c("tbl_df",
"tbl", "data.frame")))), .Names = c("rid", "start_id", "start_code",
"end_id", "end_code", "strategy", "distance", "time_seconds",
"calories", "document.id", "array.index", "start", "finish",
"startBearing", "startSpeed", "start_longitude", "start_latitude",
"finish_longitude", "finish_latitude", "crow_fly_distance", "event",
"whence", "speed", "itinerary", "clientRouteId", "plan", "note",
"length", "time", "busynance", "quietness", "signalledJunctions",
"signalledCrossings", "west", "south", "east", "north", "name",
"walk", "leaving", "arriving", "grammesCO2saved", "calories2",
"type", "coordinates"), row.names = c(NA, -1L), class = c("tbl_df",
"tbl", "data.frame"))
lines is a plotting function. I'm assuming you want sp::SpatialLines. See ?"SpatialLines-class" for how to construct such an object.
Here's for your case, provided you don't have a "corrupt" data.frame (see at the bottom of this post).
library(sp)
coords <- as.data.frame(xy$coordinates[[1]])[1:12, ]
out <- SpatialLines(list(Lines(list(Line(coords)), ID = 1)))
An object of class "SpatialLines"
Slot "lines":
[[1]]
An object of class "Lines"
Slot "Lines":
[[1]]
An object of class "Line"
Slot "coords":
x y
1 -1.294832 54.61024
2 -1.294883 54.61008
3 -1.294262 54.61002
4 -1.294141 54.61001
5 -1.293710 54.61004
6 -1.293726 54.61014
7 -1.293742 54.61025
8 -1.293510 54.61026
9 -1.293368 54.68124
10 -1.292816 54.68097
11 -1.248019 54.68060
12 -1.249478 54.68040
Slot "ID":
[1] "1"
Slot "bbox":
min max
x -1.294883 -1.248019
y 54.610006 54.681238
Slot "proj4string":
CRS arguments: NA
To add data to this object, you should use
SpatialLinesDataFrame(out, data = yourdata)
but see this example for more info.
There's a warning when I tried to coerce your coordinates to a data.frame. Hopefully this isnt' the case for your dataset.
> as.data.frame(xy$coordinates[[1]])
x y
1 -1.294832 54.61024
2 -1.294883 54.61008
3 -1.294262 54.61002
...
281 <NA> <NA>
282 <NA> <NA>
283 <NA> <NA>
284 <NA> <NA>
Warning message:
In format.data.frame(x, digits = digits, na.encode = FALSE) :
corrupt data frame: columns will be truncated or padded with NAs

Resources