I have two incomplete dataframes (df_a, df_b): Columns are missing or NA values. "by" is the merge index and df_a has "priority" over df_b.
df_a = structure(list(Datum = structure(c(1635163200, 1635166800, 1635170400, 1635174000), class = c("POSIXct", "POSIXt")), Vorhersage = c(10.297922, 10.155121, 10.044135, 9.699513), Export = c("10.912", "10.47", NA, NA), color = c("rgb(0,128,0)", "rgb(0,128,0)", NA, NA), Status = c("ok", "ok", NA, NA), Plausibilität = c("4", "4", NA, NA), min = c(7.93000000000001, 9.4, 8.7, 8.3), max = c(12.31715325, 12.42822725, 12.51326325, 12.28620625)), row.names = c(NA, -4L), class = "data.frame")
df_b = structure(list(Datum = structure(c(1632510000, 1632513600, 1632517200, 1632520800), class = c("POSIXct", "POSIXt")), Vorhersage = c(14.821988, 14.832919, 14.706179, 14.573527), Referenz = c(16.6, 16.2, 15.9, 16), DWD_Name = c("Elpersbüttel", "Elpersbüttel", "Elpersbüttel", "Elpersbüttel"), Export = c(17.198, 16.713, 16.378, 16.358), color = c("rgb(0,128,0)", "rgb(0,128,0)", "rgb(0,128,0)", "rgb(0,128,0)"), Status = c("ok", "ok", "ok", "ok"), Plausibilität = c(4, 4, 4, 4), min = c(13.05, 12.808, 11.631891, 12.312), max = c(17, 17, 16.9, 16.7)), row.names = c(NA, -4L), class = "data.frame")
desired output is:
Datum Vorhersage Export color Status Plausibilität min max Referenz
1 2021-10-25 14:00:00 10.3 10.912 rgb(0,128,0) ok 4 7.9 12 NA
2 2021-10-25 15:00:00 10.2 10.47 rgb(0,128,0) ok 4 9.4 12 NA
3 2021-10-25 16:00:00 10.0 <NA> <NA> <NA> <NA> 8.7 13 NA
4 2021-10-25 17:00:00 9.7 <NA> <NA> <NA> <NA> 8.3 12 NA
5 2021-09-24 21:00:00 14.8 17.198 rgb(0,128,0) ok 4 13.1 17 17
6 2021-09-24 22:00:00 14.8 16.713 rgb(0,128,0) ok 4 12.8 17 16
7 2021-09-24 23:00:00 14.7 16.378 rgb(0,128,0) ok 4 11.6 17 16
8 2021-09-25 00:00:00 14.6 16.358 rgb(0,128,0) ok 4 12.3 17 16
DWD_Name
1 <NA>
2 <NA>
3 <NA>
4 <NA>
5 Elpersbüttel
6 Elpersbüttel
7 Elpersbüttel
8 Elpersbüttel
# for rebuild:
structure(list(Datum = structure(c(1635163200, 1635166800, 1635170400,
1635174000, 1632510000, 1632513600, 1632517200, 1632520800), class = c("POSIXct",
"POSIXt")), Vorhersage = c(10.297922, 10.155121, 10.044135, 9.699513,
14.821988, 14.832919, 14.706179, 14.573527), Export = c("10.912",
"10.47", NA, NA, "17.198", "16.713", "16.378", "16.358"), color = c("rgb(0,128,0)",
"rgb(0,128,0)", NA, NA, "rgb(0,128,0)", "rgb(0,128,0)", "rgb(0,128,0)",
"rgb(0,128,0)"), Status = c("ok", "ok", NA, NA, "ok", "ok", "ok",
"ok"), Plausibilität = c("4", "4", NA, NA, "4", "4", "4", "4"
), min = c(7.93000000000001, 9.4, 8.7, 8.3, 13.05, 12.808, 11.631891,
12.312), max = c(12.31715325, 12.42822725, 12.51326325, 12.28620625,
17, 17, 16.9, 16.7), Referenz = c(NA, NA, NA, NA, 16.6, 16.2,
15.9, 16), DWD_Name = c(NA, NA, NA, NA, "Elpersbüttel", "Elpersbüttel",
"Elpersbüttel", "Elpersbüttel")), row.names = c(NA, -8L), class = "data.frame")
Thanks to the help of #r2evans I tried the following:
by = "Datum"
library(data.table)
colnms <- setdiff(intersect(names(df_a), names(df_b)), by)
setDT(df_a)
setDT(df_b)
merge(df_a, df_b, by = by, all = TRUE
)[, (colnms) := lapply(colnms, function(nm) fcoalesce(.SD[[paste0(nm, ".x")]], .SD[[paste0(nm, ".y")]]))
][, c(outer(colnms, c(".x", ".y"), paste0)) := NULL ][]
but I get the following error:
Error in fcoalesce(.SD[[paste0(nm, ".x")]], .SD[[paste0(nm, ".y")]]) :
Item 2 is type double but the first item is type character. Please coerce
Most of the other answers are good, but many either over-complicate the result (in my opinion) or they perform a left or right join, not the full join as expected in the OP.
Here's a quick solution that uses dynamic column names.
library(data.table)
colnms <- setdiff(intersect(names(df_a), names(df_b)), "by")
colnms
# [1] "a"
setDT(df_a)
setDT(df_b)
merge(df_a, df_b, by = "by", all = TRUE
)[, (colnms) := lapply(colnms, function(nm) fcoalesce(.SD[[paste0(nm, ".x")]], .SD[[paste0(nm, ".y")]]))
][, c(outer(colnms, c(".x", ".y"), paste0)) := NULL ][]
# by b c a
# <num> <num> <num> <num>
# 1: 1 1 NA 1
# 2: 2 NA 2 2
# 3: 3 3 3 3
# 4: 4 NA 4 4
Notes:
the normal data.table::[ merge is a left-join only, so we need to use data.table::merge in order to be able to get a full-join with all=TRUE;
because it's using merge, the repeated columns get the .x and .y suffixes, something we can easily capitalize on;
the canonical and most-performant way when using (colnms) := ... is to also include .SDcols=colnms, but that won't work as well here since we need the suffixed columns, not the colnms columns themselves; this is a slight performance penalty but certainly not an anti-pattern (I believe) given what we need to do; and since we could have more than one duplicate column, we have to be careful to do it with each pair at a time, not all of them at once;
the last [-block (using outer) is for removing the duplicate columns; without it, the output would have column names c("by", "a.x", "b", "a.y", "c", "a"). It uses outer because that's a straight-forward way to get 1-or-more colnms and combine .x and .y to each of them; it then uses data.table's := NULL shortcut for removing one-or-more columns.
This isn't the most elegant, but you can make a function that applies your rule to coalesce the values if they occur in both data frames.
# find the unique column names (not called "by")
cols <- union(names(df_a),names(df_b))
cols <- cols[!(cols == "by")]
# merge the data sets
df_merge <- merge(df_a, df_b, by = "by", all = TRUE)
# function to check for the base column names that now have a '.x' and
# a '.y' version. for the columns, fill in the NAs from '.x' with the
# value from '.y'
col_val <- function(col_base, df) {
x <- names(df)
if (all(paste0(col_base, c(".x", ".y")) %in% x)) {
na.x <- is.na(df[[paste0(col_base, ".x")]])
df[[paste0(col_base, ".x")]][na.x] <- df[[paste0(col_base, ".y")]][na.x]
df[[paste0(col_base, ".x")]]
} else {
df[[col_base]]
}
}
# apply this function to every column
cbind(df_merge["by"], sapply(cols, col_val, df = df_merge))
This will give the following result.
by a b c
1 1 1 1 NA
2 2 2 NA 2
3 3 3 3 3
4 4 4 NA 4
I know you specified base, by the natural_join() function is worth mentioning.
library(rqdatatable)
natural_join(df_a, df_b, by = "by", jointype = "FULL")
This gives exactly what you want.
by a b c
1 1 1 1 NA
2 2 2 NA 2
3 3 3 3 3
4 4 4 NA 4
Not the answer with R base. But one possible solution with the package data.table
library(data.table)
setDT(df_a)
setDT(df_b)
df_a <- rbind(df_a, list(4, NA, NA))
df_b <- rbind(list(1, NA, NA), df_b)
df_a[df_b, `:=` (a = fifelse(is.na(a), i.a, a), c = c), on = .(by)][]
#> by a b c
#> 1: 1 1 1 NA
#> 2: 2 2 NA 2
#> 3: 3 3 3 3
#> 4: 4 4 NA 4
Edit with the help of #r2evans, A much more elegant and efficient solution:
df_a[df_b, `:=` (a = fcoalesce(a, i.a), c = c), on = .(by)][]
#> by a b c
#> 1: 1 1 1 NA
#> 2: 2 2 NA 2
#> 3: 3 3 3 3
#> 4: 4 4 NA 4
Created on 2021-10-19 by the reprex package (v2.0.1)
here a dynamic solution.. not bad, but maybe someone knows how to speed it up.
get_complete_df<-function(df_a,df_b, by = "by"){
df_a = unique(df_a)
df_b = unique(df_b)
nam_a = names(df_a)[!(names(df_a) == by)]
nam_b = names(df_b)[!(names(df_b) == by)]
nums_a = unlist(lapply(df_a, is.numeric))
nums_b = unlist(lapply(df_b, is.numeric))
nums = unique(names(df_a)[nums_a],names(df_b)[nums_b])
## try to supplement NAs
x = df_b[[by]][df_b[[by]] %in% df_a[[by]]]
y = nam_b[nam_b %in% nam_a]
vna = is.na(df_a[df_a[,1] %in% x,y])
df_a[df_a[,1] %in% x ,y][vna] = df_b[df_b[,1] %in% x,y][vna]
## get complete df
all_names = c(nam_a,nam_b )
all_names = c(by, unique(all_names))
all_by = na.omit(unique(c(df_a[[by]],df_b[[by]]) ))
## build
df_o = as.data.frame(matrix(nrow = length(all_by),ncol = length(all_names)))
names(df_o) = all_names
df_o[[by]] = all_by
## fill in content
df_o[df_o[,1] %in% df_b[,1],names(df_b)] = df_b
df_o[df_o[,1] %in% df_a[,1],names(df_a)] = df_a ## df_a has priority!
# fix numeric:
# why did some(!) num fields changed to chr ?
df_o[,nums] = as.data.frame(apply(df_o[,nums], 2, as.numeric))
df_o
}
The function I created runs correctly on my computer. However, I'm trying to check for errors by devtools :: check_built () and I'm having problems with the myeq argument. The letters d and h represent my columns simple1 $ Diameter (cm) and simple1 $ Height (m), respectively.
indvol(x = simple1, mens="plot", myeq = 0.000065661*d^(2.475293)*h^(0.300022))
> head(simple1)
Plot Individual Specie Height (m) Diameter (cm)
1 1 1 Cariocar braziliense 7.5 22.60
2 1 1 Cariocar braziliense 7.5 25.78
3 1 1 Cariocar braziliense 7.5 46.15
4 1 1 Cariocar braziliense 7.5 9.55
5 1 2 Qualya parvifora 2.0 5.73
6 1 3 Magonia pubescens 4.0 5.73
The following error appears:
Error in indvol(x = simple1, mens = "plot", myeq = 6.5661e-05 * d^(2.475293) * :
object 'd' not found
Execution halted
The part dealing with the myeq argument within the function is:
if(mens=="plot"){
colnames(x)[5]<-"d"
colnames(x)[4]<-"h"
d<-x[,5]
h<-x[,4]
}
x$`Volume (m3)` <- eval(substitute(myeq), envir=x); x
Would anyone know how to fix this problem?
You have to capture the expression without evaluating it. You can do this with match.call():
indvol <- function(x, mens, myeq)
{
mc <- as.list(match.call()[-1])
if (mens == "plot")
{
colnames(x)[5] <- "d"
colnames(x)[4] <- "h"
d <- x[, 5]
h <- x[, 4]
}
x$`Volume (m3)` <- eval(mc$myeq, envir = x)
return(x)
}
So the function now works as expected:
indvol(x = simple1, mens="plot", myeq = 0.000065661*d^(2.475293)*h^(0.300022))
#> Plot Individual Specie h d Volume (m3)
#> 1 1 1 Cariocar braziliense 7.5 22.60 0.270184396
#> 2 1 1 Cariocar braziliense 7.5 25.78 0.374268984
#> 3 1 1 Carioca braziliense 7.5 46.15 1.581822308
#> 4 1 1 Carioca braziliense 7.5 9.55 0.032036171
#> 5 1 2 Qualya parvifora 2.0 5.73 0.006085243
#> 6 1 3 Magoni pubescens 4.0 5.73 0.007491927
As a design point though, it might be better to allow users to just use the column names of the passed data frame into the function.
Data
simple1 <- structure(list(Plot = c(1L, 1L, 1L, 1L, 1L, 1L), Individual = c(1L,
1L, 1L, 1L, 2L, 3L), Specie = c("Cariocar braziliense", "Cariocar braziliense",
"Carioca braziliense", "Carioca braziliense", "Qualya parvifora",
"Magoni pubescens"), `Height (m)` = c(7.5, 7.5, 7.5, 7.5, 2, 4),
`Diameter (cm)` = c(22.6, 25.78, 46.15, 9.55, 5.73, 5.73)),
class = "data.frame", row.names = c("1", "2", "3", "4", "5", "6"))
I'm fairly new to coding languages and have been asked to create a new dataframe based on two existing dataframes. Dataframe 1 is the original and dataframe 2 is a subset of the original. The new data frame needs to be a copy of the original with certain scores removed if they meet certain conditions from
df2, ie identify the task type in df2 and remove the corresponding value from df1, if the sample ID matches.
E.g.Dataframe1:
sample_id Low Mid High
13420 NA 2.4 3.7
43905 7.5 NA NA
52078 5.6 3.2 5.6
43292 10 NA 1.9
79327 5.7 3.2 NA
Dataframe2:
Sample Task type
13420 High
52078 Low
52078 Mid
43292 High
79327 Low
New dataframe:
13420 NA 2.4 NA
43905 7.5 NA NA
52078 NA NA 5.6
43292 10 NA NA
79327 NA 3.2 NA
Can anyone help, please? I've tried a few conditional statements, but have had no luck.
sample data
df1 <- data.table::fread("sample_id Low Mid High
13420 NA 2.4 3.7
43905 7.5 NA NA
52078 5.6 3.2 5.6
43292 10 NA 1.9
79327 5.7 3.2 NA")
df2 <- data.table::fread("Sample Tasktype
13420 High
52078 Low
52078 Mid
43292 High
79327 Low")
code
library( data.table )
#or make data.frames a data.table
data.table::setDT(df1);data.table::setDT(df2)
#melt df1 to long format
df1.melt <- melt( df1, id.vars = "sample_id" )
#update join the molten dataset with df2, updating the value with NA
df1.melt[ df2, value := NA, on = .(sample_id = Sample, variable = Tasktype) ]
#and cast df1 wit the new values back to wide format
dcast( df1.melt, sample_id ~ variable, value.var = "value" )
output
# sample_id Low Mid High
# 1: 13420 NA 2.4 NA
# 2: 43292 10.0 NA NA
# 3: 43905 7.5 NA NA
# 4: 52078 NA NA 5.6
# 5: 79327 NA 3.2 NA
Here is an approach with dplyr and tidyr:
library(dplyr)
library(tidyr)
df1 %>%
pivot_longer(-sample_id) %>%
left_join(df2, by = c("sample_id" = "Sample",
"name" = "Task.type"),
keep = TRUE) %>%
mutate(value = ifelse(!is.na(Task.type) & Task.type == name,
NA_real_, value)) %>%
dplyr::select(-c(Sample,Task.type)) %>%
pivot_wider(id_cols = c("sample_id"))
## A tibble: 5 x 4
# sample_id Low Mid High
# <int> <dbl> <dbl> <dbl>
#1 13420 NA 2.4 NA
#2 43905 7.5 NA NA
#3 52078 NA NA 5.6
#4 43292 10 NA NA
#5 79327 NA 3.2 NA
Sample Data
df1<-structure(list(sample_id = c(13420L, 43905L, 52078L, 43292L,
79327L), Low = c(NA, 7.5, 5.6, 10, 5.7), Mid = c(2.4, NA, 3.2,
NA, 3.2), High = c(3.7, NA, 5.6, 1.9, NA)), class = "data.frame", row.names = c(NA,
-5L))
df2 <- structure(list(Sample = c(13420L, 52078L, 52078L, 43292L, 79327L
), Task.type = structure(c(1L, 2L, 3L, 1L, 2L), .Label = c("High",
"Low", "Mid"), class = "factor")), class = "data.frame", row.names = c(NA,
-5L))
I have a list of dataframes. Each dataframe is a Stock quote whose row names are dates and column names are buy price, sell price, shares and PL.
I want to obtain a column that contains the percentage of every positive PL contribution to the total daily PL.
Making it simplier. I have the following Data:
mylist= structure(list(`1` = structure(list(ID = c(35L, '2009-01-03', '2009-01-04', '2009-01-05'), Income = c(100, 200, 300, 400)), .Names = c("Date", "Income"), row.names = c(1L, 2L, 3L, 4L), class = "data.frame"), `2` = structure(list(ID = c('2009-01-02', '2009-01-03', '2009-01-04', '2009-01-05'), Income = c(500, -600, 700, 800)), .Names = c("Date", "Income"), row.names = c(1L, 2L, 3L, 4L), class = "data.frame"), `3` = structure(list(ID = c('2009-01-02', '2009-01-03', '2009-01-04'), Income = c(100, 200, 300)), .Names = c("Date", "Income"), row.names = c(1L, 2L, 3L), class = "data.frame")))
Which looks like this:
$`1`
Date Income
1 2009-01-01 100
2 2009-01-03 200
3 2009-01-04 300
4 2009-01-05 400
$`2`
Date Income
1 2009-01-02 500
2 2009-01-03 -600
3 2009-01-04 700
4 2009-01-05 800
$`3`
Date Income
1 2009-01-02 100
2 2009-01-03 200
3 2009-01-04 300
I want to obtain something that looks like this:
$`1`
Date Income Perc
1 2009-01-03 100 1.00
2 2009-01-03 200 0.20
3 2009-01-04 300 0.23
4 2009-01-05 400 0.33
$`2`
Date Income Perc
1 2009-01-02 500 0.83
2 2009-01-03 600 -1.50
3 2009-01-04 700 0.54
4 2009-01-05 800 0.67
$`3`
Date Income Perc
1 2009-01-02 100 0.17
2 2009-01-03 200 0.20
3 2009-01-04 300 0.23
I have two solutions for your problem. I highly recommend combining your data frame in one master data frame in order to reduce the complexity of the code if at all possible. I am sure there are better solutions to the "Separate Data Frame" problem, but most of them will involve multiple loops and thus negatively impact performance.
Data
mylist= structure(list(`1` = structure(list(ID = c('2009-01-02', '2009-01-03', '2009-01-04', '2009-01-05'), Income = c(100, 200, 300, 400)), .Names = c("Date", "Income"), row.names = c(1L, 2L, 3L, 4L), class = "data.frame"), `2` = structure(list(ID = c('2009-01-02', '2009-01-03', '2009-01-04', '2009-01-05'), Income = c(500, -600, 700, 800)), .Names = c("Date", "Income"), row.names = c(1L, 2L, 3L, 4L), class = "data.frame"), `3` = structure(list(ID = c('2009-01-02', '2009-01-03', '2009-01-04'), Income = c(100, 200, 300)), .Names = c("Date", "Income"), row.names = c(1L, 2L, 3L), class = "data.frame")))
Combined Data Frame
library(dplyr)
# add an ID to each data frame
for(i in 1:length(mylist)){
mylist[[i]] <- cbind(mylist[[i]], stock_id = names(mylist)[i])
}
# create data frame with all observations
my_data_frame <- do.call(rbind, mylist)
rownames(my_data_frame) <- NULL
my_data_frame %>%
group_by(Date) %>%
mutate(Perc = Income/sum(Income[Income > 0]))
# A tibble: 11 x 4
# Groups: Date [4]
Date Income stock_id Perc
<chr> <dbl> <chr> <dbl>
1 2009-01-02 100 1 0.143
2 2009-01-03 200 1 0.5
3 2009-01-04 300 1 0.231
4 2009-01-05 400 1 0.333
5 2009-01-02 500 2 0.714
6 2009-01-03 -600 2 -1.5
7 2009-01-04 700 2 0.538
8 2009-01-05 800 2 0.667
9 2009-01-02 100 3 0.143
10 2009-01-03 200 3 0.5
11 2009-01-04 300 3 0.231
Separate Data Frames
library(dplyr)
all_dates <- unique(unlist(lapply(mylist, function(x) unique(x$Date))))
for(i in 1:length(mylist)){
mylist[[i]] <- cbind(mylist[[i]], stock_id = names(mylist)[i])
}
perc_all <- list()
for(i in 1:length(all_dates)){
temporary <- lapply(mylist, function(x) x[x$Date == all_dates[i], ])
all_obs_date <- do.call(rbind, temporary)
all_obs_date$Perc <- all_obs_date$Income/sum(all_obs_date$Income[all_obs_date$Income > 0])
perc_all[[i]] <- all_obs_date
}
perc_final <- do.call(rbind, perc_all)
lapply(mylist, function(x) {
left_join(x, perc_final) %>% select(-stock_id)
})
$`1`
Date Income Perc
1 2009-01-02 100 0.1428571
2 2009-01-03 200 0.5000000
3 2009-01-04 300 0.2307692
4 2009-01-05 400 0.3333333
$`2`
Date Income Perc
1 2009-01-02 500 0.7142857
2 2009-01-03 -600 -1.5000000
3 2009-01-04 700 0.5384615
4 2009-01-05 800 0.6666667
$`3`
Date Income Perc
1 2009-01-02 100 0.1428571
2 2009-01-03 200 0.5000000
3 2009-01-04 300 0.2307692
This is related to question: How to extract rows (using Loop) from a data frame and save it in another data frame
If 'POS' of ddf lies between any 'start' and 'end' of refdf, it needs to be included in outdf which has same structure as ddf. I could manage it with 'for' loop, but can it be done without using 'for' loop?
ddf = structure(list(POS = c(23L, 48L, 5L), Freq1 = c(0.5, 0.7, 0.8
), Freq2 = c(0.45, 0.55, 0.65)), .Names = c("POS", "Freq1", "Freq2"
), class = "data.frame", row.names = c(NA, -3L))
refdf = structure(list(Start = c(1L, 25L, 60L), End = c(10L, 50L, 75L
)), .Names = c("Start", "End"), class = "data.frame", row.names = c(NA,
-3L))
ddf
# POS Freq1 Freq2
#1 23 0.5 0.45
#2 48 0.7 0.55
#3 5 0.8 0.65
refdf
# Start End
#1 1 10
#2 25 50
#3 60 75
outdf = data.frame(POS=numeric(), Freq1=numeric(), Freq2=numeric())
for(i in 1:nrow(ddf)) for(j in 1:nrow(refdf)){
if(ddf[i,1]>refdf[j,1] && ddf[i,1]<refdf[j,2])
{outdf[nrow(outdf)+1,] = ddf[i,]; next}
}
outdf
# POS Freq1 Freq2
#2 48 0.7 0.55
#3 5 0.8 0.65
I tried following but it does not work:
apply(ddf,1,function(x){print(x);ifelse(x[1]>refdf$Start & x[1]<refdf$End, x,"")})
This isn't particularly efficient in space for large problems, but it doesn't use for:
ddf[ddf$POS %in% unlist(apply(refdf, 1, function(x) seq(x[1],x[2]))),]
## POS Freq1 Freq2
## 2 48 0.7 0.55
## 3 5 0.8 0.65
All the allowed values of POS are computed by the unlist(apply) expression. This of course assumes that POS contains only integral values.
Here is a way. It doesn't necessitate integer values, but it's also not going to be especially efficient:
pow <- cbind(expand.grid(ddf$POS, refdf$Start), Var3=expand.grid(ddf$POS, refdf$End)$Var2)
boom <- pow[which(pow$Var1 > pow$Var2 & pow$Var1 < pow$Var3), 'Var1']
ddf[ddf$POS %in% boom, ]
# POS Freq1 Freq2
#2 48 0.7 0.55
#3 5 0.8 0.65