Replacing certain values in a data.frame in R

I am trying to replace the NAs in "test" with the forecast values in "forecast". I am trying to use match, but I can't figure it out. Keep in mind that id and time together form a two-part unique key, and that my real data set is much larger than this example (about 32,000 rows). Any suggestions?
test = data.frame(id =c(1,1,1,2,2,2), time=c(89,99,109,89,99,109), data=c(3,4,NA,5,2,NA))
forecast = data.frame(id =c(1,2), time=c(109,109), data=c(5,1))
Desired output
out = data.frame(id =c(1,1,1,2,2,2), time=c(89,99,109,89,99,109), data=c(3,4,5,5,2,1))

Here is the data.table solution:
test_dt <- data.table(test, key = c('id', 'time'))
forecast_dt <- data.table(forecast, key = c('id', 'time'))
# In the join, test's "data" column comes through with the "i." prefix
forecast_dt[test_dt][, data := ifelse(is.na(data), i.data, data)]
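With a reasonably recent data.table, the same result can be written as an update join, which fills test's NAs in place instead of building a new table (a sketch; fifelse() needs data.table >= 1.12.3, plain ifelse() works otherwise):
# For rows of test_dt whose (id, time) match forecast_dt, replace NA data
# with forecast's value; i.data refers to forecast_dt's data column
test_dt[forecast_dt, on = .(id, time),
        data := fifelse(is.na(data), i.data, data)]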
EDIT. Benchmarking tests: data.table is ~3x faster even for this small data set.
library(rbenchmark)
f_merge <- function() {
  out2 <- merge(test, forecast, by = c("id", "time"), all.x = TRUE)
  out2 <- transform(out2,
    newdata = ifelse(is.na(data.x), data.y, data.x),
    data.x = NULL, data.y = NULL)
  return(out2)
}
f_dtable <- function() {
  test <- data.table(test, key = c('id', 'time'))
  forecast <- data.table(forecast, key = c('id', 'time'))
  test <- forecast[test][, data := ifelse(is.na(data), i.data, data)]
  test$i.data <- NULL
  return(test)
}
benchmark(f_merge(), f_dtable(), order = 'relative',
columns = c('test', 'elapsed', 'relative'))
test elapsed relative
2 f_dtable() 0.86 1.00
1 f_merge() 2.26 2.63

I would use merge to join the data together and then compute your new column in two steps:
out2 <- merge(test, forecast, by = c("id", "time"), all.x = TRUE)
> out2
id time data.x data.y
1 1 89 3 NA
2 1 99 4 NA
3 1 109 NA 5
4 2 89 5 NA
5 2 99 2 NA
6 2 109 NA 1
# Compute the new variable and clean up the old ones:
out2 <- transform(out2, newdata = ifelse(is.na(data.x), data.y, data.x), data.x = NULL, data.y = NULL)
> out2
id time newdata
1 1 89 3
2 1 99 4
3 1 109 5
4 2 89 5
5 2 99 2
6 2 109 1
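For completeness, a dplyr equivalent of this merge approach (a sketch; the ".fc" suffix is arbitrary):
# Left-join forecast onto test, then fill NAs from the forecast column
library(dplyr)
out3 <- test %>%
  left_join(forecast, by = c("id", "time"), suffix = c("", ".fc")) %>%
  mutate(data = coalesce(data, data.fc)) %>%
  select(-data.fc)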

Try this:
test$data[is.na(test$data)] <- forecast[(forecast$id %in% test$id) & (forecast$time %in% test$time), ]$data
Note that this relies on the NA rows of test occurring in the same (id, time) order as the rows of forecast; the %in% tests do not actually align on the two-part key.
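A safer route that uses match(), as the question originally attempted, is to build an ad hoc composite key from id and time (a sketch; pasting the two columns together is just a stand-in for a proper multi-column key):
# Align each NA row of test to the matching forecast row via a composite key
na_idx <- which(is.na(test$data))
key_test <- paste(test$id, test$time)
key_forecast <- paste(forecast$id, forecast$time)
test$data[na_idx] <- forecast$data[match(key_test[na_idx], key_forecast)]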

Related

Parameter to check if all values have joined when joining two dataframes

I often have two data frames that I wish to join, where I expect all values to join. If not all values are present in both data frames, I want it to return an error.
Here is a MWE:
library(dplyr, warn.conflicts = FALSE)
df1 <- data.frame(
  id = c(1:5),
  value1 = rep(1, 5)
)
print(df1)
#> id value1
#> 1 1 1
#> 2 2 1
#> 3 3 1
#> 4 4 1
#> 5 5 1
df2 <- data.frame(
  id = c(1:4),
  value2 = rep(2, 4)
)
print(df2)
#> id value2
#> 1 1 2
#> 2 2 2
#> 3 3 2
#> 4 4 2
df3 <- inner_join(
  df1,
  df2,
  by = "id")
print(df3)
#> id value1 value2
#> 1 1 1 2
#> 2 2 1 2
#> 3 3 1 2
#> 4 4 1 2
# Check if all values have joined
stopifnot(
  nrow(df3) == max(nrow(df1), nrow(df2))
)
#> Error: nrow(df3) == max(nrow(df1), nrow(df2)) is not TRUE
Created on 2021-03-31 by the reprex package (v1.0.0)
This works, but I do not like the stopifnot(). It feels cumbersome, particularly if I wish to overwrite df2: I then need to stash a temporary value, df2_previous_row_num <- nrow(df2), and check stopifnot(nrow(df2) == df2_previous_row_num) afterwards.
Also, the nrow() test only works if all values in id are unique. There are other methods, e.g. stopifnot(c(df1$id %in% df3$id, df2$id %in% df3$id)), but these are equally ugly.
Really what I am looking for is a parameter that makes the join fail if some values do not join. Something like, inner_join(df1, df2, fail_if_not_all_present = TRUE).
I am not attached to the tidyverse - if there is a base R or data.table way of doing this then I would consider those.
Does anyone know anything?
You can try writing a custom inner join function:
custom_inner_join <- function(data1, data2, by, fail_if_not_all_present = FALSE) {
  if (fail_if_not_all_present) {
    # Paste the key columns row-wise to compare composite keys
    vals1 <- do.call(paste, data1[by])
    vals2 <- do.call(paste, data2[by])
    if (all(vals1 %in% vals2) && all(vals2 %in% vals1)) {
      merge(data1, data2, by = by)
    } else stop('Not all key values are present')
  } else {
    merge(data1, data2, by = by)
  }
}
custom_inner_join(df1, df2, 'id')
# id value1 value2
#1 1 1 2
#2 2 1 2
#3 3 1 2
#4 4 1 2
custom_inner_join(df1, df2, 'id', fail_if_not_all_present = TRUE)
Error in custom_inner_join(df1, df2, "id", fail_if_not_all_present = TRUE) :
Not all key values are present
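As a later note: if a recent dplyr is available (the unmatched argument was added in dplyr 1.1.0, so treat this as version-dependent), the built-in joins can enforce this check directly:
# Errors because df1's id 5 has no match in df2
inner_join(df1, df2, by = "id", unmatched = "error")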

How to change column names in several data frames at the same time?

I have five data frames with the same number of columns. I want to use rbind to append them, but they have different variable names. Fortunately, the names follow the same pattern:
date prod1 code1 tot1
date prod2 code2 tot2
...
date prod5 code5 tot5
I want to drop the number suffix from all of them at once so that I can then rbind my data frames. How can I do this?
Thanks in advance.
Since the question was how to change the column names, I will address that problem first:
lapply(dflist, setNames, nm = new_col_name)
df1 <- data.frame(prod1 = 1:5, code1 = 1:5, tot1 = 1:5)
df2 <- data.frame(prod2 = 1:5, code2 = 1:5, tot2 = 1:5)
dflist <- list(df1, df2)
lapply(dflist, setNames, nm = c("prod", "code", "tot"))
[[1]]
prod code tot
1 1 1 1
2 2 2 2
3 3 3 3
4 4 4 4
5 5 5 5
[[2]]
prod code tot
1 1 1 1
2 2 2 2
3 3 3 3
4 4 4 4
5 5 5 5
As already mentioned, it may be better to just ignore the column names and use rbindlist from data.table to bind the rows:
data.table::rbindlist(dflist, use.names = FALSE)
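Note that lapply() returns a new list rather than renaming dflist in place, so assign the result back before stacking (a minimal sketch reusing the names from above):
# Capture the renamed list, then stack with base rbind
dflist <- lapply(dflist, setNames, nm = c("prod", "code", "tot"))
combined <- do.call(rbind, dflist)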
You can do it using magrittr and dplyr :
d1 <- mtcars
d2 <- d1
d3 <- d1
names(d2) <- paste0(names(d2), "_2")
names(d3) <- paste0(names(d3), "_3")
rbind(d1, d2, d3) # gives an error, ok
#> Error in match.names(clabs, names(xi)): names do not match previous names
library(magrittr, quietly = TRUE, warn.conflicts = FALSE)
library(dplyr, quietly = TRUE, warn.conflicts = FALSE)
df_list <- list(d2, d3)
df_list <- lapply(df_list, magrittr::set_colnames, names(d1))
df_final <- rbind(d1, dplyr::bind_rows(df_list))
nrow(df_final) == 3 * nrow(d1)
#> [1] TRUE

Factor levels reaching certain values

I need to find out how many factor levels have values of a continuous variable at or above each of several thresholds.
The code below produces the desired result for the example data, but it is a rather awkward workaround.
My real data frame is much larger, and the real plot should show more values (or be continuous) on the x-axis, so I would appreciate a more scalable approach.
set.seed(5)
df <- data.frame(ID = factor(c("a","a","b","c","d","e","e")),values = runif(7,0,6))
seq <- 1:5
length.unique <- function(x) length(unique(x))
sub1 <- df[which(df$values >= 1), ]
sub2 <- df[which(df$values >= 2), ]
sub3 <- df[which(df$values >= 3), ]
sub4 <- df[which(df$values >= 4), ]
sub5 <- df[which(df$values >= 5), ]
N_IDs <- c(length.unique(sub1$ID),length.unique(sub2$ID),length.unique(sub3$ID),length.unique(sub4$ID),length.unique(sub5$ID))
plot(N_IDs ~ seq, type="b")
Using the tidyverse, you can save some time by first calculating the max value for each ID:
library(tidyverse)
idmax <- df %>% group_by(ID) %>% summarize(max=max(values)) %>% pull(max)
Then, for each cut point, count how many IDs pass:
map_df(1:5, ~ data.frame(cut = ., count = sum(idmax >= .)))
# cut count
# 1 1 4
# 2 2 3
# 3 3 3
# 4 4 3
# 5 5 1
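If a recent purrr is installed (map_df() was superseded around purrr 1.0), the same idea can be spelled with map() plus list_rbind() — a sketch, assuming R >= 4.1 for the native pipe:
# One data frame row per cut point, bound together at the end
library(purrr)
map(1:5, ~ data.frame(cut = .x, count = sum(idmax >= .x))) |>
  list_rbind()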
Using non-equi joins:
library(data.table)
setDT(df)
df[.(seq = 1:5), .(N_IDs = uniqueN(ID)),
   on = .(values >= seq), by = .EACHI, allow.cartesian = TRUE]
# values N_IDs
#1: 1 4
#2: 2 3
#3: 3 3
#4: 4 3
#5: 5 1
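A plain base R alternative to the sub1..sub5 copies in the question (a sketch operating on the original df):
# Count distinct IDs whose values reach each threshold, in one pass
thresholds <- 1:5
N_IDs <- sapply(thresholds, function(th) length(unique(df$ID[df$values >= th])))
plot(N_IDs ~ thresholds, type = "b")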

How to compare and tabulate the frequency of common elements between two vectors in R

I have two vectors with common and repeated elements. I want a table comparing the frequencies of the common elements in both vectors. Here is a subset:
plyr::count(V1)
x freq
1 A*02:01 106
2 A*02:02 88
3 A*03:01 95
4 A*03:02 60
plyr::count(V2)
x freq
1 A*02:01 11
2 A*02:02 11
3 A*02:04 1
4 A*03:01 20
The output I want is:
x freq.V1 freq.V2
1 A*02:01 106 11
2 A*02:02 88 11
3 A*03:01 95 20
I think merge is a good choice here, as the default is to keep only the observations common to both data sets. So the following should work:
merge(plyr::count(V1), plyr::count(V2), by="x")
Worked example
plyr::count(mtcars$gear)
# x freq
# 1 3 15
# 2 4 12
# 3 5 5
plyr::count(mtcars$gear[1:10])
# x freq
# 1 3 4
# 2 4 6
merge(
  plyr::count(mtcars$gear),
  plyr::count(mtcars$gear[1:10]),
  by = "x")
# x freq.x freq.y
# 1 3 15 4
# 2 4 12 6
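The same join can also be written without plyr, using dplyr's count() (a sketch; the vectors are wrapped in one-column data frames, and the name argument labels each count column):
library(dplyr)
inner_join(
  count(data.frame(x = V1), x, name = "freq.V1"),
  count(data.frame(x = V2), x, name = "freq.V2"),
  by = "x")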
Just use table:
tbl1 <- table(V1[V1 %in% (int <- intersect(unique(V1), unique(V2)))])
tbl2 <- table(V2[V2 %in% int])
data.frame(x = names(tbl1), freq.V1 = as.vector(tbl1), freq.V2 = as.vector(tbl2))
Or my favorite, data.table:
library(data.table)
DT <- data.table(V1 = V1, V2 = V2)
DT[V1 %in% unique(V2), .(freq.V1 = .N), by = .(x = V1)
][DT[V2 %in% unique(V1), .N, by = .(x = V2)],
freq.V2 := i.N, on = "x", nomatch = 0L]
Of course both options look much simpler if you know beforehand that V1 and V2 consist of the same set of elements:
data.frame(x = names(tbl1 <- table(V1)), freq.V1 = as.vector(tbl1),
freq.V2 = as.vector(table(V2)))
and
DT[ , .(freq.V1 = .N), by = .(x = V1)
][DT[ , .(freq.V2 = .N), by = .(x = V2)], on = "x"]

How to calculate count by group, then keep only one per group

Say that I have this data.frame, data:
data <- data.frame(val=c(rep(6,10), rep(7, 15), rep(8, 20), rep(9, 25), rep(10, 100), rep(11, 20), rep(12, 15), rep(13, 10)))
data$plus <- data$val + 100
My goal is to create a new data.frame that has the frequencies of each val, and the associated plus value.
My current strategy is to create a frequency table (called table, which shadows base::table), merge in the frequencies, and then keep only the first observation within each group:
table <- table(data$val)
df1 <- data.frame(val = as.integer(names(table)[1:length(table)]), N = table[1:length(table)])
df2 <- merge(data, df1)
df3 <- do.call(rbind, by(df2, list(df2$val), FUN=function(x) head(x, 1)))
This works, but it seems clunky.
In Stata, for example, it would take less and simpler code. Something like:
bys val plus: egen max = _N
bys val plus: gen first = _n==1
keep if first==1
Is there a way to simplify the R code or make it more elegant?
Here's an approach using "data.table":
library(data.table)
as.data.table(data)[, N := .N, by = val][, .SD[1], by = val]
# val plus N
# 1: 6 106 10
# 2: 7 107 15
# 3: 8 108 20
# 4: 9 109 25
# 5: 10 110 100
# 6: 11 111 20
# 7: 12 112 15
# 8: 13 113 10
## Or (@RicardoSaporta)
as.data.table(data)[, list(.N, plus=plus[1]), by = val]
## Or (@DavidArenburg)
unique(as.data.table(data)[, N := .N, by = val], by = "val")
With "dplyr", you can try:
library(dplyr)
data %>%
  group_by(val) %>%
  mutate(N = n()) %>%
  slice(1)
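In more recent dplyr versions this collapses to a single count() call (a sketch; the name argument labels the count column):
# One row per (val, plus) pair together with its frequency
data %>% count(val, plus, name = "N")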
In base R, I guess you can try something like:
do.call(rbind, lapply(split(data, data$val),
                      function(x) cbind(x, N = nrow(x))[1, ]))
Edit:
Or you can use aggregate():
data$N = 0
out = aggregate(N ~ val + plus, data = data, length)
or else:
out = aggregate(plus ~ val, data = data, function(x) c(unique(x), N = length(x)))
do.call(data.frame, out)
or using ddply:
library(plyr)
out = ddply(data, .(val,plus), summarize, N = length(plus))
#> out
# val plus N
#1 6 106 10
#2 7 107 15
#3 8 108 20
#4 9 109 25
#5 10 110 100
#6 11 111 20
#7 12 112 15
#8 13 113 10
