Error with a function inside lapply() in R

I'm having a very strange error in a script that used to work perfectly, and I don't know what the problem is. I start by creating a very long list of data frames, each with the same number of columns. The list is called lst. Then I want to build a summary table with means and SDs. Here is the script for that:
w1 <- lapply(lst, function(i) t(cbind(Mean = colMeans(i[, c(6, 7, 8, 9)], na.rm = TRUE),
                                      Sds = colSds(as.matrix(i[, c(6, 7, 8, 9)]), na.rm = TRUE),
                                      N = length(i[, 2]),
                                      len.max = max(i[, 6]))))
The column indices are correct. However, when I run the script I am first dropped into the debugger, and when I stop it I get this error message:
Error in t(cbind(Mean = colMeans(i[, c(6, 7, 8, 9)], na.rm = TRUE), Sds = colSds(as.matrix(i[, :
error in evaluating the argument 'x' in selecting a method for function 't': Error in `[.data.frame`(i, , c(6, 7, 8, 9)) : undefined columns selected
I don't know what's wrong with the function. I searched the internet and saw a suggestion to change as.matrix to data.matrix, but that doesn't do the trick.
Indeed, I get the same problem with another, very similar function:
a1 <- lapply(lst, function(i) t(cbind(l1 = NROW(which(i[, 6] > 1)),
                                      l1.05 = NROW(which(i[, 6] <= 1)) - NROW(which(i[, 6] > 0.5)),
                                      l05.03 = NROW(which(i[, 6] > 0.3)) - NROW(which(i[, 6] <= 0.5)),
                                      l03 = NROW(which(i[, 6] <= 0.3)))))
With the same outcome:
Error in t(cbind(l1 = NROW(which(i[, 6] > 1)), l1.05 = NROW(which(i[, :
error in evaluating the argument 'x' in selecting a method for function 't': Error in `[.data.frame`(i, , 6) : undefined columns selected
Can someone point out what the problem is? Do you need some data? Thanks!
I'm working with the latest RStudio and the following packages:
plyr, matrixStats, dplyr
Here is an example of the list:
> lst
[[1]]
X Chr new pos1 pos2 len nsnp n.ind per.ind
1 1 1 1 12900000 13700000 0.9 284.7560 23.77778 7.952434
2 2 1 2 17000000 17300000 0.4 126.5582 16.00000 5.351171
3 3 1 3 21200000 21500000 0.4 126.5582 40.75000 13.628763
4 4 1 4 45300000 45700000 0.5 158.1978 23.20000 7.759197
5 5 1 5 45900000 46600000 0.8 253.1165 31.12500 10.409699
[[2]]
X Chr new pos1 pos2 len nsnp n.ind per.ind
1 1 1 1 12900000 13700000 0.9 312.90267 24.44444 4.288499
2 2 1 2 21200000 21500000 0.4 139.06785 38.00000 6.666667
3 3 1 3 32600000 33000000 0.5 173.83482 28.40000 4.982456
4 4 1 4 35800000 36100000 0.4 139.06785 37.25000 6.535088
5 5 1 5 36300000 36300000 0.1 34.76696 22.00000 3.859649
[[3]]
X Chr new pos1 pos2 len nsnp n.ind per.ind
1 1 1 1 35700000 36500000 0.9 287.4214 12.22222 11.42264
2 2 1 2 45900000 46600000 0.8 255.4857 12.50000 11.68224
3 3 1 3 49400000 50700000 1.4 447.1000 21.78571 20.36048
4 4 1 4 51000000 52000000 1.1 351.2929 16.00000 14.95327
5 5 1 5 52200000 53000000 0.9 287.4214 19.66667 18.38006
dput(lst[1:3])
list(structure(list(X = 1:5, Chr = c(1L, 1L, 1L, 1L, 1L), new = 1:5,
pos1 = c(12900000, 1.7e+07, 21200000, 45300000, 45900000),
pos2 = c(13700000, 17300000, 21500000, 45700000, 46600000
), len = c(0.9, 0.4, 0.4, 0.5, 0.8), nsnp = c(284.756031128405,
126.558236057069, 126.558236057069, 158.197795071336, 253.116472114137
), n.ind = c(23.7777777777778, 16, 40.75, 23.2, 31.125),
per.ind = c(7.95243403939056, 5.35117056856187, 13.628762541806,
7.75919732441472, 10.4096989966555)), .Names = c("X", "Chr",
"new", "pos1", "pos2", "len", "nsnp", "n.ind", "per.ind"), row.names = c(NA,
5L), class = "data.frame"), structure(list(X = 1:5, Chr = c(1L,
1L, 1L, 1L, 1L), new = 1:5, pos1 = c(12900000, 21200000, 32600000,
35800000, 36300000), pos2 = c(13700000, 21500000, 3.3e+07, 36100000,
36300000), len = c(0.9, 0.4, 0.5, 0.4, 0.1), nsnp = c(312.90267141585,
139.0678539626, 173.83481745325, 139.0678539626, 34.76696349065
), n.ind = c(24.4444444444444, 38, 28.4, 37.25, 22), per.ind = c(4.28849902534113,
6.66666666666667, 4.98245614035088, 6.53508771929825, 3.85964912280702
)), .Names = c("X", "Chr", "new", "pos1", "pos2", "len", "nsnp",
"n.ind", "per.ind"), row.names = c(NA, 5L), class = "data.frame"),
structure(list(X = 1:5, Chr = c(1L, 1L, 1L, 1L, 1L), new = 1:5,
pos1 = c(35700000, 45900000, 49400000, 5.1e+07, 52200000
), pos2 = c(36500000, 46600000, 50700000, 5.2e+07, 5.3e+07
), len = c(0.9, 0.8, 1.4, 1.1, 0.9), nsnp = c(287.421428571429,
255.485714285714, 447.1, 351.292857142857, 287.421428571429
), n.ind = c(12.2222222222222, 12.5, 21.7857142857143,
16, 19.6666666666667), per.ind = c(11.4226375908619,
11.6822429906542, 20.3604806408545, 14.9532710280374,
18.380062305296)), .Names = c("X", "Chr", "new", "pos1",
"pos2", "len", "nsnp", "n.ind", "per.ind"), row.names = c(NA,
5L), class = "data.frame"))
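The error comes from `[.data.frame` complaining about undefined columns, so a likely cause is that at least one element of lst has fewer columns than the nine shown above (for example an empty or malformed data frame). A quick diagnostic sketch, assuming every element is supposed to share that structure:
# Which elements of lst lack the expected number of columns?
ncols <- vapply(lst, ncol, integer(1))
which(ncols < 9)
# Or check by name, in case columns were dropped or renamed upstream
needed <- c("len", "nsnp", "n.ind", "per.ind")
which(!vapply(lst, function(d) all(needed %in% names(d)), logical(1)))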

Related

Simple but not easy merge task

I have two incomplete data frames (df_a, df_b): columns are missing or contain NA values. "by" is the merge index (the Datum column here) and df_a has "priority" over df_b.
df_a = structure(list(Datum = structure(c(1635163200, 1635166800, 1635170400, 1635174000), class = c("POSIXct", "POSIXt")), Vorhersage = c(10.297922, 10.155121, 10.044135, 9.699513), Export = c("10.912", "10.47", NA, NA), color = c("rgb(0,128,0)", "rgb(0,128,0)", NA, NA), Status = c("ok", "ok", NA, NA), Plausibilität = c("4", "4", NA, NA), min = c(7.93000000000001, 9.4, 8.7, 8.3), max = c(12.31715325, 12.42822725, 12.51326325, 12.28620625)), row.names = c(NA, -4L), class = "data.frame")
df_b = structure(list(Datum = structure(c(1632510000, 1632513600, 1632517200, 1632520800), class = c("POSIXct", "POSIXt")), Vorhersage = c(14.821988, 14.832919, 14.706179, 14.573527), Referenz = c(16.6, 16.2, 15.9, 16), DWD_Name = c("Elpersbüttel", "Elpersbüttel", "Elpersbüttel", "Elpersbüttel"), Export = c(17.198, 16.713, 16.378, 16.358), color = c("rgb(0,128,0)", "rgb(0,128,0)", "rgb(0,128,0)", "rgb(0,128,0)"), Status = c("ok", "ok", "ok", "ok"), Plausibilität = c(4, 4, 4, 4), min = c(13.05, 12.808, 11.631891, 12.312), max = c(17, 17, 16.9, 16.7)), row.names = c(NA, -4L), class = "data.frame")
The desired output is:
Datum Vorhersage Export color Status Plausibilität min max Referenz
1 2021-10-25 14:00:00 10.3 10.912 rgb(0,128,0) ok 4 7.9 12 NA
2 2021-10-25 15:00:00 10.2 10.47 rgb(0,128,0) ok 4 9.4 12 NA
3 2021-10-25 16:00:00 10.0 <NA> <NA> <NA> <NA> 8.7 13 NA
4 2021-10-25 17:00:00 9.7 <NA> <NA> <NA> <NA> 8.3 12 NA
5 2021-09-24 21:00:00 14.8 17.198 rgb(0,128,0) ok 4 13.1 17 17
6 2021-09-24 22:00:00 14.8 16.713 rgb(0,128,0) ok 4 12.8 17 16
7 2021-09-24 23:00:00 14.7 16.378 rgb(0,128,0) ok 4 11.6 17 16
8 2021-09-25 00:00:00 14.6 16.358 rgb(0,128,0) ok 4 12.3 17 16
DWD_Name
1 <NA>
2 <NA>
3 <NA>
4 <NA>
5 Elpersbüttel
6 Elpersbüttel
7 Elpersbüttel
8 Elpersbüttel
# for rebuild:
structure(list(Datum = structure(c(1635163200, 1635166800, 1635170400,
1635174000, 1632510000, 1632513600, 1632517200, 1632520800), class = c("POSIXct",
"POSIXt")), Vorhersage = c(10.297922, 10.155121, 10.044135, 9.699513,
14.821988, 14.832919, 14.706179, 14.573527), Export = c("10.912",
"10.47", NA, NA, "17.198", "16.713", "16.378", "16.358"), color = c("rgb(0,128,0)",
"rgb(0,128,0)", NA, NA, "rgb(0,128,0)", "rgb(0,128,0)", "rgb(0,128,0)",
"rgb(0,128,0)"), Status = c("ok", "ok", NA, NA, "ok", "ok", "ok",
"ok"), Plausibilität = c("4", "4", NA, NA, "4", "4", "4", "4"
), min = c(7.93000000000001, 9.4, 8.7, 8.3, 13.05, 12.808, 11.631891,
12.312), max = c(12.31715325, 12.42822725, 12.51326325, 12.28620625,
17, 17, 16.9, 16.7), Referenz = c(NA, NA, NA, NA, 16.6, 16.2,
15.9, 16), DWD_Name = c(NA, NA, NA, NA, "Elpersbüttel", "Elpersbüttel",
"Elpersbüttel", "Elpersbüttel")), row.names = c(NA, -8L), class = "data.frame")
Thanks to the help of @r2evans I tried the following:
by = "Datum"
library(data.table)
colnms <- setdiff(intersect(names(df_a), names(df_b)), by)
setDT(df_a)
setDT(df_b)
merge(df_a, df_b, by = by, all = TRUE
)[, (colnms) := lapply(colnms, function(nm) fcoalesce(.SD[[paste0(nm, ".x")]], .SD[[paste0(nm, ".y")]]))
][, c(outer(colnms, c(".x", ".y"), paste0)) := NULL ][]
but I get the following error:
Error in fcoalesce(.SD[[paste0(nm, ".x")]], .SD[[paste0(nm, ".y")]]) :
Item 2 is type double but the first item is type character. Please coerce
Most of the other answers are good, but many either over-complicate the result (in my opinion) or they perform a left or right join, not the full join as expected in the OP.
Here's a quick solution that uses dynamic column names.
library(data.table)
colnms <- setdiff(intersect(names(df_a), names(df_b)), "by")
colnms
# [1] "a"
setDT(df_a)
setDT(df_b)
merge(df_a, df_b, by = "by", all = TRUE
)[, (colnms) := lapply(colnms, function(nm) fcoalesce(.SD[[paste0(nm, ".x")]], .SD[[paste0(nm, ".y")]]))
][, c(outer(colnms, c(".x", ".y"), paste0)) := NULL ][]
# by b c a
# <num> <num> <num> <num>
# 1: 1 1 NA 1
# 2: 2 NA 2 2
# 3: 3 3 3 3
# 4: 4 NA 4 4
Notes:
the normal data.table::[ merge is a left-join only, so we need to use data.table::merge in order to be able to get a full-join with all=TRUE;
because it's using merge, the repeated columns get the .x and .y suffixes, something we can easily capitalize on;
the canonical and most-performant way when using (colnms) := ... is to also include .SDcols=colnms, but that won't work as well here since we need the suffixed columns, not the colnms columns themselves; this is a slight performance penalty but certainly not an anti-pattern (I believe) given what we need to do; and since we could have more than one duplicate column, we have to be careful to do it with each pair at a time, not all of them at once;
the last [-block (using outer) is for removing the duplicate columns; without it, the output would have column names c("by", "a.x", "b", "a.y", "c", "a"). It uses outer because that's a straightforward way to take one-or-more colnms and append .x and .y to each of them; it then uses data.table's := NULL shortcut for removing one or more columns.
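One extra wrinkle with the OP's actual data: fcoalesce requires the paired columns to have the same type, and Export and Plausibilität are character in df_a but numeric in df_b, which is exactly the error shown in the question. A minimal sketch of one way to harmonize them before merging, assuming numeric is the intended type:
# Sketch: coerce df_a's character versions of the shared columns to numeric
# so that fcoalesce sees matching types (assumes numeric is the intended type);
# do this before converting with setDT()
df_a$Export <- as.numeric(df_a$Export)
df_a$Plausibilität <- as.numeric(df_a$Plausibilität)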
This isn't the most elegant, but you can make a function that applies your rule to coalesce the values if they occur in both data frames.
# find the unique column names (not called "by")
cols <- union(names(df_a), names(df_b))
cols <- cols[!(cols == "by")]
# merge the data sets
df_merge <- merge(df_a, df_b, by = "by", all = TRUE)
# function to check for the base column names that now have a '.x' and
# a '.y' version. For the columns, fill in the NAs from '.x' with the
# value from '.y'
col_val <- function(col_base, df) {
  x <- names(df)
  if (all(paste0(col_base, c(".x", ".y")) %in% x)) {
    na.x <- is.na(df[[paste0(col_base, ".x")]])
    df[[paste0(col_base, ".x")]][na.x] <- df[[paste0(col_base, ".y")]][na.x]
    df[[paste0(col_base, ".x")]]
  } else {
    df[[col_base]]
  }
}
# apply this function to every column
cbind(df_merge["by"], sapply(cols, col_val, df = df_merge))
This will give the following result.
by a b c
1 1 1 1 NA
2 2 2 NA 2
3 3 3 3 3
4 4 4 NA 4
I know you specified base R, but the natural_join() function is worth mentioning.
library(rqdatatable)
natural_join(df_a, df_b, by = "by", jointype = "FULL")
This gives exactly what you want.
by a b c
1 1 1 1 NA
2 2 2 NA 2
3 3 3 3 3
4 4 4 NA 4
Not an answer with base R, but here is one possible solution with the data.table package:
library(data.table)
setDT(df_a)
setDT(df_b)
df_a <- rbind(df_a, list(4, NA, NA))
df_b <- rbind(list(1, NA, NA), df_b)
df_a[df_b, `:=` (a = fifelse(is.na(a), i.a, a), c = c), on = .(by)][]
#> by a b c
#> 1: 1 1 1 NA
#> 2: 2 2 NA 2
#> 3: 3 3 3 3
#> 4: 4 4 NA 4
Edit, with the help of @r2evans: a much more elegant and efficient solution:
df_a[df_b, `:=` (a = fcoalesce(a, i.a), c = c), on = .(by)][]
#> by a b c
#> 1: 1 1 1 NA
#> 2: 2 2 NA 2
#> 3: 3 3 3 3
#> 4: 4 4 NA 4
Created on 2021-10-19 by the reprex package (v2.0.1)
Here is a dynamic solution. Not bad, but maybe someone knows how to speed it up.
get_complete_df <- function(df_a, df_b, by = "by") {
  df_a = unique(df_a)
  df_b = unique(df_b)
  nam_a = names(df_a)[!(names(df_a) == by)]
  nam_b = names(df_b)[!(names(df_b) == by)]
  nums_a = unlist(lapply(df_a, is.numeric))
  nums_b = unlist(lapply(df_b, is.numeric))
  nums = unique(names(df_a)[nums_a], names(df_b)[nums_b])
  ## try to supplement NAs
  x = df_b[[by]][df_b[[by]] %in% df_a[[by]]]
  y = nam_b[nam_b %in% nam_a]
  vna = is.na(df_a[df_a[, 1] %in% x, y])
  df_a[df_a[, 1] %in% x, y][vna] = df_b[df_b[, 1] %in% x, y][vna]
  ## get complete df
  all_names = c(nam_a, nam_b)
  all_names = c(by, unique(all_names))
  all_by = na.omit(unique(c(df_a[[by]], df_b[[by]])))
  ## build
  df_o = as.data.frame(matrix(nrow = length(all_by), ncol = length(all_names)))
  names(df_o) = all_names
  df_o[[by]] = all_by
  ## fill in content
  df_o[df_o[, 1] %in% df_b[, 1], names(df_b)] = df_b
  df_o[df_o[, 1] %in% df_a[, 1], names(df_a)] = df_a  ## df_a has priority!
  # fix numeric: why did some(!) num fields change to chr?
  df_o[, nums] = as.data.frame(apply(df_o[, nums], 2, as.numeric))
  df_o
}
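A usage sketch with the data frames from the question (the key column there is Datum, so the default by has to be overridden); run it on plain data frames, i.e. before any setDT() calls:
# Usage sketch, assuming df_a and df_b are the question's data frames
res <- get_complete_df(df_a, df_b, by = "Datum")
str(res)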

Pivot dataset and column names [R]

I have a dataset that I want to pivot.
dataset <- data.frame(date = c("01/01/2020", "02/01/2020", "02/01/2020", "03/01/2020")
                      , camp_type = c("acquisition", "acquisition", "newsletter", "acquisition")
                      , channel_type = c("email", "direct_mail", "email", "email")
                      , sent = c(100, 200, 50, 250)
                      , open = c(30, NA, 14, 148)
                      , click = c(14, NA, 1, 100)
                      )
PLEASE NOTE: I have many more camp_types than the ones displayed in this example.
I want to get one row per day, with the rest of the information spread across columns, renaming the columns "sent", "open" and "click" based on "channel_type" and "camp_type".
I have tried something not very elegant, and entirely manual, but I get an error when I rename the variables (code below)
dataset %>%
  filter(camp_type == 'Acquisition' & channel_type == 'direct_mail') %>%
  rename(dm_acq_sent = sent
         , dm_acq_open = open
         , dm_acq_click = clicked
         )
The problem with the code above is that (once I fix the renaming issue) it will be heavily manual: I have to repeat the same chunk of code several times, and someone has to regularly check that there are no new combinations of camp_type and channel_type.
Any help / advice will be massively appreciated.
With tidyr you can use pivot_wider:
library(tidyr)
pivot_wider(df, id_cols = date, names_from = c(camp_type, channel_type), values_from = c(sent, open, click))
Output
# A tibble: 3 x 10
date sent_acquisition… sent_acquisition_… sent_newsletter_… open_acquisitio… open_acquisition… open_newsletter… click_acquisiti… click_acquisitio… click_newslette…
<date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2020-01-01 100 NA NA 30 NA NA 14 NA NA
2 2020-02-01 NA 200 50 NA NA 14 NA NA 1
3 2020-03-01 250 NA NA 148 NA NA 100 NA NA
Data
df <- structure(list(date = structure(c(18262, 18293, 18293, 18322), class = "Date"),
camp_type = structure(c(1L, 1L, 2L, 1L), .Label = c("acquisition",
"newsletter"), class = "factor"), channel_type = structure(c(2L,
1L, 2L, 2L), .Label = c("direct_email", "email"), class = "factor"),
sent = c(100, 200, 50, 250), open = c(30, NA, 14, 148), click = c(14,
NA, 1, 100)), class = "data.frame", row.names = c(NA, -4L
))
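If you want more control over how the new columns are named (for example channel type first, in the spirit of the dm_acq_sent naming the question attempts), pivot_wider's names_glue argument can build the names from a glue template; a sketch, assuming tidyr >= 1.1.0:
pivot_wider(df,
            id_cols = date,
            names_from = c(camp_type, channel_type),
            values_from = c(sent, open, click),
            names_glue = "{channel_type}_{camp_type}_{.value}")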

R: recursive algorithm for querying a binary tree

I have a tree called mytree. In R, I have it stored as a list:
mytree <- list(left = structure(list(y = -10, x = 10, grad = -10.5, sim_score = 110.25,
value = -10.5, criterion = "x < 15"), row.names = 1L, class = "data.frame"),
right = list(left = list(left = structure(list(y = 7, x = 20,
grad = 6.5, sim_score = 42.25, value = 6.5, criterion = "x < 22.5"), row.names = 2L, class = "data.frame"),
right = structure(list(y = 8, x = 25, grad = 7.5, sim_score = 56.25,
value = 7.5, criterion = "x >= 22.5"), row.names = 3L, class = "data.frame"),
root = list(root = structure(list(y = c(7, 8), x = c(20,
25), grad = c(6.5, 7.5), sim_score = c(98, 98), value = c(7,
7), criterion = c("x < 30", "x < 30")), row.names = 2:3, class = "data.frame"),
gain = 0.5)), right = structure(list(y = -7, x = 35,
grad = -7.5, sim_score = 56.25, value = -7.5, criterion = "x >= 30"), row.names = 4L, class = "data.frame"),
root = list(root = structure(list(y = c(7, 8, -7), x = c(20,
25, 35), grad = c(6.5, 7.5, -7.5), sim_score = c(14.0833333333333,
14.0833333333333, 14.0833333333333), value = c(2.16666666666667,
2.16666666666667, 2.16666666666667), criterion = c("x >= 15",
"x >= 15", "x >= 15")), row.names = 2:4, class = "data.frame"),
gain = 140.166666666667)), root = list(root = structure(list(
y = c(-10, 7, 8, -7), x = c(10, 20, 25, 35), grad = c(-10.5,
6.5, 7.5, -7.5), sim_score = c(4, 4, 4, 4)), row.names = c(NA,
-4L), class = "data.frame"), gain = 120.333333333333))
which looks like this
$left
y x grad sim_score value criterion
1 -10 10 -10.5 110.25 -10.5 x < 15
$right
$right$left
$right$left$left
y x grad sim_score value criterion
2 7 20 6.5 42.25 6.5 x < 22.5
$right$left$right
y x grad sim_score value criterion
3 8 25 7.5 56.25 7.5 x >= 22.5
$right$left$root
$right$left$root$root
y x grad sim_score value criterion
2 7 20 6.5 98 7 x < 30
3 8 25 7.5 98 7 x < 30
$right$left$root$gain
[1] 0.5
$right$right
y x grad sim_score value criterion
4 -7 35 -7.5 56.25 -7.5 x >= 30
$right$root
$right$root$root
y x grad sim_score value criterion
2 7 20 6.5 14.08333 2.166667 x >= 15
3 8 25 7.5 14.08333 2.166667 x >= 15
4 -7 35 -7.5 14.08333 2.166667 x >= 15
$right$root$gain
[1] 140.1667
$root
$root$root
y x grad sim_score
1 -10 10 -10.5 4
2 7 20 6.5 4
3 8 25 7.5 4
4 -7 35 -7.5 4
$root$gain
[1] 120.3333
The splits are stored under criterion, and the leaf values are stored under value.
Given a new data point, x = 5, I would like to query mytree and see which leaf node this instance falls under. For x = 5, my function should output a value of -10.5 because 5 < 15. Similarly, if x = 25, then it should end up in the leaf with the value 7.5. Here are some more examples of what I'd like my pred_tree function to output:
newdata <- data.frame(x = c(5, 19, 18, 30))
> pred_tree(tree = mytree, newdata = newdata)
[1] -10.5
[2] 6.5
[3] 6.5
[4] -7.5
Here's what I have so far:
pred_tree <- function(tree, newdata){
  for(i in length(tree)){
    # Check if this is a leaf
    if(length(tree[[i]]) == 1){
      # Check criterion
      if(eval(parse(text = tree[[i]]$criterion))){
        # Return value of leaf
        return(tree[[i]]$value[1])
      }
    } else if(length(tree[[i]]) > 1){
      for(j in 1:length(tree[[i]])){
        if(length(tree[[i]][[j]]) == 1){
          # Check criterion
          if(eval(parse(text = tree[[i]][[j]]$criterion))){
            # Return value of leaf
            return(tree[[i]][[j]]$value[1])
          }
        }
      }
    }
  }
}
pred_tree(tree = mytree, newdata = newdata)
Unfortunately, this function is not returning the correct output. Also, this is rather clunky and can be very slow if I have many queries to run. I'm guessing using a recursive algorithm would make more sense instead of using nested for loops. Can anyone point me in the right direction?
############# EDIT #############
mytree3 <- list(left = list(left = structure(list(y = -10, x = 10, grad = 0,
sim_score = 0, value = 0, criterion = "x < 15"), row.names = 1L, class = "data.frame"),
right = structure(list(y = 7, x = 20, grad = -0.5, sim_score = 0.25,
value = -0.5, criterion = "x >= 15"), row.names = 2L, class = "data.frame"),
root = list(root = structure(list(y = c(-10, 7), x = c(10,
20), grad = c(0, -0.5), sim_score = c(0.125, 0.125), value = c(-0.25,
-0.25), criterion = c("x < 22.5", "x < 22.5")), row.names = 1:2, class = "data.frame"),
gain = 0.125)), right = list(left = structure(list(y = 8,
x = 25, grad = 0.5, sim_score = 0.25, value = 0.5, criterion = "x < 30"), row.names = 3L, class = "data.frame"),
right = structure(list(y = -7, x = 35, grad = 0, sim_score = 0,
value = 0, criterion = "x >= 30"), row.names = 4L, class = "data.frame"),
root = list(root = structure(list(y = c(8, -7), x = c(25,
35), grad = c(0.5, 0), sim_score = c(0.125, 0.125), value = c(0.25,
0.25), criterion = c("x >= 22.5", "x >= 22.5")), row.names = 3:4, class = "data.frame"),
gain = 0.125)), root = list(root = structure(list(y = c(-10,
7, 8, -7), x = c(10, 20, 25, 35), grad = c(0, -0.5, 0.5, 0),
sim_score = c(0, 0, 0, 0), value = c(0, 0, 0, 0)), row.names = c(NA,
-4L), class = "data.frame"), gain = 0.25))
Running the following did not give the right output
pred_tree(tree = mytree3, newdata = newdata)
A simple recursion that you can use is:
.pred <- function(x, tree)
{
  # Ensure you pass in a list and not a data frame
  if (is.data.frame(tree)) tree <- list(tree)
  # Reorder the list if necessary
  if (!is.data.frame(tree[[1]])) tree <- tree[c(2, 1, 3)]
  # Check whether the condition is met. If so, return
  if (eval(parse(text = tree[[1]][["criterion"]]), list(x = x))) tree[[1]][["value"]][1]
  else .pred(x, tree[[2]])
}

pred_tree <- function(tree, newdata)
{
  cbind(newdata, pred = Vectorize(.pred, "x")(x = newdata$x, tree))
}
Now you can call your function:
pred_tree(mytree,data.frame(x=c(1,10,15,18,19,22,23,25,29,30,33,35,100)))
x pred
1 1 -10.5
2 10 -10.5
3 15 6.5
4 18 6.5
5 19 6.5
6 22 6.5
7 23 7.5
8 25 7.5
9 29 7.5
10 30 -7.5
11 33 -7.5
12 35 -7.5
13 100 -7.5
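As a quick usage sketch, the function also reproduces the examples given in the question (expected there: -10.5, 6.5, 6.5, -7.5):
newdata <- data.frame(x = c(5, 19, 18, 30))
pred_tree(mytree, newdata)
#    x  pred
# 1  5 -10.5
# 2 19   6.5
# 3 18   6.5
# 4 30  -7.5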

Divide data by the preceding row and create new dataframe

I have a data set and I'm trying to calculate the rate of change between the rows.
My input looks like this:
foo = structure(list(date = structure(c(5L, 1L, 2L, 3L, 4L), .Label = c("10/03/2020",
"11/03/2020", "12/03/2020", "13/03/2020", "9/03/2020"), class = "factor"),
A = c(0.60256322, 0.634543306, 0.022976661, 0.009839044,
0.319456765), B = c(45.42320826, 57.32689951, 32.49487759,
29.40804164, 54.33691346), C = c(5.114123914, 3.674167652,
2.330610757, 5.510280192, 5.717950467), D = c(4.187409484,
4.835943165, 4.340614439, 4.607468576, 3.14338155)), row.names = c(NA,
5L), class = "data.frame")
I'm trying to divide each cell by the one in the row above it,
e.g. [5,2] / [4,2]; [4,2] / [3,2], etc.,
and I'm trying to create a new output like this:
bar = structure(list(date = structure(c(5L, 1L, 2L, 3L, 4L), .Label = c("10/03/2020",
"11/03/2020", "12/03/2020", "13/03/2020", "9/03/2020"), class = "factor"),
A = c(0, 1.053073412, 0.03620976, 0.428219052, 32.46827283
), B = c(0, 1.262061878, 0.56683473, 0.90500546, 1.847688946
), C = c(0, 0.718435398, 0.634323465, 2.364307371, 1.037687789
), D = c(0, 1.154877063, 0.897573501, 1.061478424, 0.682236134
)), row.names = c(NA, 5L), class = "data.frame")
I'm sure there's a better way than finding the length of the column and looping through. Can anyone point me in the right direction?
You can use mutate_if or mutate_at from the dplyr package.
library(dplyr)
foo %>%
  mutate_if(!grepl("date", names(.)), function(x) x/lag(x))
OR
foo %>%
  mutate_at(vars(-date), function(x) x/lag(x))
In base R, we can use head and tail to divide data.
foo[-1] <- lapply(foo[-1], function(x) c(0, tail(x, -1)/head(x, -1)))
foo
# date A B C D
#1 9/03/2020 0.00000000 0.0000000 0.0000000 0.0000000
#2 10/03/2020 1.05307341 1.2620619 0.7184354 1.1548771
#3 11/03/2020 0.03620976 0.5668347 0.6343235 0.8975735
#4 12/03/2020 0.42821905 0.9050055 2.3643074 1.0614784
#5 13/03/2020 32.46827283 1.8476889 1.0376878 0.6822361
Another tidyverse approach.
library(tidyverse)
bar <- foo %>%
mutate_if(is.double, ~ replace_na(./lag(.), replace = 0))
bar
#> date A B C D
#> 1 9/03/2020 0.00000000 0.0000000 0.0000000 0.0000000
#> 2 10/03/2020 1.05307341 1.2620619 0.7184354 1.1548771
#> 3 11/03/2020 0.03620976 0.5668347 0.6343235 0.8975735
#> 4 12/03/2020 0.42821905 0.9050055 2.3643074 1.0614784
#> 5 13/03/2020 32.46827283 1.8476889 1.0376878 0.6822361
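For newer dplyr versions, across() replaces the superseded mutate_if/mutate_at verbs; a sketch assuming dplyr >= 1.0.0, keeping the same convention of a 0 in the first row:
library(dplyr)
foo %>%
  mutate(across(-date, ~ coalesce(.x / lag(.x), 0)))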

Insert a column into a tabular object

I found this blog post on getting a nested table into LaTeX format (Blog Link). I like the outcome, but I want to insert a column into the object at the beginning, right after the row names. I'm used to dealing with data frames, so dealing with this beast is more difficult than typical column indexing.
Here's what I have now:
pre post
approach mean sd mean sd
1 24.17 8.310 54.33 11.01
2 25.50 9.434 65.25 16.32
3 26.33 9.139 63.17 12.53
And here's what I'd like it to look like:
pre post
approach n mean sd mean sd
1 12 24.17 8.310 54.33 11.01
2 12 25.50 9.434 65.25 16.32
3 12 26.33 9.139 63.17 12.53
Here's the dput of z and also the column of n's I'd like to insert.
Thank you in advance.
z <- structure(list(24.1666666666667, 25.5, 26.3333333333333, 8.31027111835746,
9.4339811320566, 9.13866245766587, 54.3333333333333, 65.25,
63.1666666666667, 11.0068848977136, 16.3157759685081, 12.5323822978956), .Dim = 3:4, .Dimnames = list(
NULL, c("term", "term", "term", "term")), rowLabels = structure(c("1",
"2", "3"), .Dim = c(3L, 1L), .Dimnames = list(NULL, "approach"), justification = structure(c(NA_character_,
NA_character_, NA_character_), .Dim = c(3L, 1L)), colnamejust = NA_character_, justify = NA, suppress = 0), colLabels = structure(c("pre",
"mean", NA, "sd", "post", "mean", NA, "sd"), .Dim = c(2L, 4L), justification = structure(c(NA_character_,
NA_character_, NA_character_, NA_character_, NA_character_, NA_character_,
NA_character_, NA_character_), .Dim = c(2L, 4L)), colnamejust = character(0), justify = NA, suppress = 0), table = value *
v * approach ~ variable2 * result_variable, formats = structure(c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Dim = 3:4, .Dimnames = list(
NULL, c("format", "format", "format", "format"))), justification = structure(c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA), .Dim = 3:4, .Dimnames = list(
NULL, c("justification", "justification", "justification",
"justification"))), class = "tabular")
structure(c(12L, 12L, 12L), .Names = c("1", "2", "3"))
The only way I know of is to re-assign with the columns re-ordered:
R> mockup <- data.frame(B=21:23, C=31:33)
R> mockup
B C
1 21 31
2 22 32
3 23 33
R>
Now add column A:
R> mockup[,"A"] <- 1:3
R> mockup
B C A
1 21 31 1
2 22 32 2
3 23 33 3
R>
And reorder:
R> mockup <- mockup[,c("A", "B", "C")]
R> mockup
A B C
1 1 21 31
2 2 22 32
3 3 23 33
R>
Presto. New column at the beginning.
Something like this:
z <- data.frame(approach = gl(3, 12), pre = rnorm(36)*50, post = rnorm(36)*60)
library(tables)
tabular(approach ~ (pre + post) * (mean + sd), data = z)
pre post
approach mean sd mean sd
1 -5.431 61.01 3.766 54.76
2 20.408 29.14 -9.261 54.58
3 -7.854 53.55 -30.046 62.41
tabular(approach ~ (n = 1) + (pre + post) * (mean + sd), data = z)
pre post
approach n mean sd mean sd
1 12 -5.431 61.01 3.766 54.76
2 12 20.408 29.14 -9.261 54.58
3 12 -7.854 53.55 -30.046 62.41
tabular(approach + 1 ~ (n = 1) + (pre + post) * (mean + sd), data = z)
pre post
approach n mean sd mean sd
1 12 -5.431 61.01 3.766 54.76
2 12 20.408 29.14 -9.261 54.58
3 12 -7.854 53.55 -30.046 62.41
All 36 2.374 50.06 -11.847 57.46
For more details, see the vignette of the tables package.
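Since the original goal was LaTeX output: recent versions of the tables package can render a tabular object to LaTeX directly (hedged, as the interface has changed over versions; older versions used latex() via Hmisc). A minimal sketch:
tab <- tabular(approach ~ (n = 1) + (pre + post) * (mean + sd), data = z)
toLatex(tab)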
