I need to collect the common columns from a list of data frames into separate data frames, one per column. Please look at the following example:
df1 <- data.frame(Dates = c('01-01-2020','02-01-2020','03-01-2020'), col1 = c(1,2,3), col2 = c(3,2,1))
df2 <- data.frame(Dates = c('01-01-2020','02-01-2020','03-01-2020'), col1 = c(4,5,6), col2 = c(6,5,4))
df3 <- data.frame(Dates = c('01-01-2020','02-01-2020'), col1 = c(7,8), col2 = c(8,7))
ldf <- list(df1, df2, df3)
The desired output would be the following two data frames:
df_col1:
Date df1 df2 df3
01-01-2020 1 4 7
02-01-2020 2 5 8
03-01-2020 3 6 NA
df_col2:
Date df1 df2 df3
01-01-2020 3 6 8
02-01-2020 2 5 7
03-01-2020 1 4 NA
Of course, ldf is actually way longer, but the number of columns is fixed to 5, so the number of outputs is also fixed (4). This means I wouldn't mind using a block of code for each output.
I've tried several things but none seems to work. I'm using base R and hope to find a solution without additional packages.
Thanks a lot for your time!
We bind the list elements with bind_rows from dplyr, then loop over the 'col' columns along with the common 'Dates', reshape to 'wide' format with pivot_wider, and rename if needed. (Note: rowid() comes from data.table, so it is called below with an explicit data.table:: prefix.)
library(dplyr)
library(purrr)
library(tidyr)
library(stringr)
newdf <- bind_rows(ldf)
out <- map(names(newdf)[-1], ~
  newdf %>%
    select(Dates, .x) %>%
    mutate(rn = data.table::rowid(Dates)) %>%
    pivot_wider(names_from = rn, values_from = !!rlang::sym(.x)) %>%
    rename_at(-1, ~ str_c('df', seq_along(.))))
Output:
out
#[[1]]
# A tibble: 3 x 4
# Dates df1 df2 df3
# <chr> <dbl> <dbl> <dbl>
#1 01-01-2020 1 4 7
#2 02-01-2020 2 5 8
#3 03-01-2020 3 6 NA
#[[2]]
# A tibble: 3 x 4
# Dates df1 df2 df3
# <chr> <dbl> <dbl> <dbl>
#1 01-01-2020 3 6 8
#2 02-01-2020 2 5 7
#3 03-01-2020 1 4 NA
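If you'd rather not depend on data.table for rowid(), the same per-date index can be built with row_number() after grouping (a sketch, not part of the original answer):
library(dplyr)
newdf <- bind_rows(ldf)
newdf <- newdf %>%
  group_by(Dates) %>%
  mutate(rn = row_number()) %>%  # same index as data.table::rowid(Dates)
  ungroup()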
Or using base R
newdf <- do.call(rbind, ldf)
f1 <- function(dat, colName) {
  # split the column's values by date; each chunk holds one value per source df
  lst1 <- split(dat[[colName]], dat$Dates)
  # pad shorter chunks with NA, then bind into a dates-by-df matrix
  do.call(rbind, lapply(lst1, `length<-`, max(lengths(lst1))))
}
f1(newdf, 'col1')
f1(newdf, 'col2')
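For reference, with the example data these calls return a matrix with the dates as row names; renaming the columns reproduces the desired df1/df2/df3 headers (a sketch of the expected result):
out1 <- f1(newdf, 'col1')
colnames(out1) <- paste0('df', seq_len(ncol(out1)))
out1
#            df1 df2 df3
# 01-01-2020   1   4   7
# 02-01-2020   2   5   8
# 03-01-2020   3   6  NA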
Another Base R option is to do:
m <- Reduce(function(x,y)merge(x, y, by='Dates', all=TRUE), ldf)
lapply(split.default(m[-1], sub("\\..*", "", names(m[-1]))), cbind, m[1])
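This works because merge() disambiguates the duplicated column names with .x/.y suffixes, so stripping everything after the first dot groups the columns by their original name. To also get df1/df2/df3 headers with Dates first, one could rename each piece (a sketch, not part of the original answer):
out <- lapply(split.default(m[-1], sub("\\..*", "", names(m[-1]))),
              function(cols) setNames(cbind(m[1], cols),
                                      c("Dates", paste0("df", seq_along(cols)))))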
Another wordy approach using base R:
#Code
names(ldf) <- paste0('df',1:length(ldf))
#Function
myfun <- function(x) {
y <- reshape(x,direction = 'long',
v.names='col',
idvar = 'Dates',varying = list(2:3))
return(y)
}
z <- do.call(rbind,lapply(ldf,myfun))
z$Data <- gsub("\\..*","",rownames(z))
rownames(z) <- NULL
#Reshape
z2 <- reshape(z,idvar = c('Dates','time'),timevar = 'Data')
#List
List <- split(z2,z2$time)
List
Output:
List
$`1`
Dates time col.df1 col.df2 col.df3
1 01-01-2020 1 1 4 7
2 02-01-2020 1 2 5 8
3 03-01-2020 1 3 6 NA
$`2`
Dates time col.df1 col.df2 col.df3
4 01-01-2020 2 3 6 8
5 02-01-2020 2 2 5 7
6 03-01-2020 2 1 4 NA
I have a very large dataframe (around 100 rows, 200 columns). A subset of my data looks like this:
example <- data.frame("Station" = c("012", "013", "014"), "Value1" = c(145.23453, 1.022342, 0.4432),
"Value2" = c(2.1221213, 4445.2231412, 0.3333421), "Name" = c("ABC", "SDS", "EFG"))
I would like to round all numeric variables in my table with these conditions.
if x<1, then 1 sig fig
if 1<= x < 99, then 2 sig figs
if x>= 100, then 3 sig figs
I know how to do something like this for a specific column:
example$Value1 <- ifelse(example$Value1 < 1, signif(example$Value1, 1), example$Value1)
but I'm not sure what to do for a large dataframe with a mix of numeric and character values.
Just put the ifelse into an lapply. To identify the numeric columns, use is.numeric in an sapply. You also could Vectorize a small replacement FUNction with all your desired conditions to use in the lapply, which might be convenient. However, note #GKi's comment that your conditions are not complete: values in [99, 100) match none of them and are left unchanged.
nums <- sapply(example, is.numeric)
FUN <- Vectorize(function(x) {
if (x < 1) x <- signif(x, 1)
if (1 <= x & x < 99) x <- signif(x, 2)
if (x >= 100) x <- signif(x, 3)
x
})
example[nums] <- lapply(example[nums], FUN)
# Station Value1 Value2 Name
# 1 012 145.0 2.1 ABC
# 2 013 1.0 4450.0 SDS
# 3 014 0.4 0.3 EFG
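One caveat (my addition, not from the original answer): the scalar if() conditions error on NA input. If your real data contain missing values, a guarded variant could look like this:
FUN <- Vectorize(function(x) {
  if (is.na(x)) return(NA_real_)  # pass NAs through untouched
  if (x < 1) x <- signif(x, 1)
  if (1 <= x & x < 99) x <- signif(x, 2)
  if (x >= 100) x <- signif(x, 3)
  x
})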
Code:
library(dplyr)
library(tidyr)
library(purrr)
example %>%
pivot_longer(contains("Value")) %>%
mutate(
signf = case_when(
value < 1 ~ 1,
value >= 1 & value < 99 ~ 2,
TRUE ~ 3
),
value = map2_dbl(value, signf, ~signif(.x, .y))
) %>%
select(-signf) %>%
pivot_wider(names_from = "name", values_from = "value")
Output:
# A tibble: 3 x 4
Station Name Value1 Value2
<fct> <fct> <dbl> <dbl>
1 012 ABC 145 2.1
2 013 SDS 1 4450
3 014 EFG 0.4 0.3
I'll give the answer using data.table instead of data.frame because it's better and I don't remember data.frame syntax that well anymore.
library(data.table)
example = data.table(
Station = c("012", "013", "014"),
Value1 = c(145.23453, 1.022342, 0.4432),
Value2 = c(2.1221213, 4445.2231412, 0.3333421),
Name = c("ABC", "SDS", "EFG"))
numeric_colnames = names(example)[sapply(example,is.numeric)]
for(x in numeric_colnames){
example[,(x):=ifelse(
get(x)<1,
signif(get(x),1),
ifelse(
get(x)<99,
signif(get(x),2),
signif(get(x),3)
))]
}
Result:
Station Value1 Value2 Name
1: 012 145.0 2.1 ABC
2: 013 1.0 4450.0 SDS
3: 014 0.4 0.3 EFG
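As an aside (my addition), the per-column loop can also be written with .SD/.SDcols, which is the more idiomatic data.table pattern, using the same nested-ifelse logic:
example[, (numeric_colnames) := lapply(.SD, function(x)
  ifelse(x < 1, signif(x, 1),
         ifelse(x < 99, signif(x, 2), signif(x, 3)))),
  .SDcols = numeric_colnames]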
PS: Don't worry about the 145.0 and 4450.0; that's a display issue, not a data issue:
> example[,as.character(Value1)]
[1] "145" "1" "0.4"
> example[,as.character(Value2)]
[1] "2.1" "4450" "0.3"
PPS: the 99 cutoff produces some strange results, e.g.,
> signif(98.9,2)
[1] 99
> signif(99.1,3)
[1] 99.1
Why not use a cutoff of 100 instead?
> signif(99.4,2)
[1] 99
> signif(99.5,2)
[1] 100
> signif(100.1,3)
[1] 100
Use apply and nested ifelse:
If you do not know in advance which columns are numeric and you want to keep the original dataframe:
example[sapply(example, is.numeric)] <- apply(example[sapply(example, is.numeric)], 2,
function(x) ifelse(x < 1, signif(x, 1),
ifelse(x >= 1 & x < 99 , signif(x, 2), signif(x, 3))))
example
Station Value1 Value2 Name
1 012 145.0 2.1 ABC
2 013 1.0 4450.0 SDS
3 014 0.4 0.3 EFG
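A small refinement (my suggestion, not part of the original answer): compute the numeric mask once and use lapply, which avoids the repeated sapply call and apply's intermediate matrix coercion:
nums <- sapply(example, is.numeric)
example[nums] <- lapply(example[nums], function(x)
  ifelse(x < 1, signif(x, 1),
         ifelse(x < 99, signif(x, 2), signif(x, 3))))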
You can use findInterval to choose the number of significant digits for signif: findInterval(x, c(1, 99)) returns 0, 1, or 2 depending on the range, and adding 1 gives the desired 1, 2, or 3 significant figures:
i <- sapply(example, is.numeric)
x <- unlist(example[,i])
example[,i] <- signif(x, findInterval(x, c(1, 99))+1)
example
# Station Value1 Value2 Name
#1 012 145.0 2.1 ABC
#2 013 1.0 4450.0 SDS
#3 014 0.4 0.3 EFG
The findInterval result for the example values from #webb's comment (thanks!):
findInterval(c(145.23453, 1.022342, 0.4432, 2.1221213, 4445.2231412
, 0.3333421), c(1, 99))
#[1] 2 1 0 1 2 0
I want to join two tibbles by a range or a virtual column, but it seems the by parameter only accepts a character vector of existing column names.
In my example I have a tibble d with a column value, and a tibble r with a from and a to column.
d <- tibble(value = seq(1,6, by = 0.2))
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), class = LETTERS[seq(1,6)])
> d
# A tibble: 26 x 1
value
<dbl>
1 1.0
2 1.2
3 1.4
4 1.6
5 1.8
6 2.0
7 2.2
8 2.4
9 2.6
10 2.8
# ... with 16 more rows
> r
# A tibble: 6 x 3
from to class
<int> <dbl> <chr>
1 1 2 A
2 2 3 B
3 3 4 C
4 4 5 D
5 5 6 E
6 6 Inf F
Now I want to join the value column in d within the range of from and to in r:
d %>% inner_join(r, by = "value between from and to") # >= and <
I can't find a way to do this, so I decided to join the floor of value in d with the from column in r:
d %>% inner_join(r, by = c("floor(value)" = "from"))
Of course I can create a helper column to solve that:
d %>%
mutate(join_value = floor(value)) %>%
inner_join(r, by = c("join_value" = "from")) %>%
select(value, class)
# A tibble: 26 x 2
value class
<dbl> <chr>
1 1.0 A
2 1.2 A
3 1.4 A
4 1.6 A
5 1.8 A
6 2.0 B
7 2.2 B
8 2.4 B
9 2.6 B
10 2.8 B
# ... with 16 more rows
But isn't there a more comfortable way?
Thanks
I don't think inequality joins are implemented in dplyr yet, or that they ever will be (see this discussion on join on inequality constraints), but this is a good situation to use an SQL join:
library(tibble)
library(sqldf)
as_tibble(sqldf("select d.value, r.class from d
                 join r on d.value >= r.'from' and
                           d.value < r.'to'"))
Alternatively, if you want to integrate the join into your dplyr chain, you can use fuzzyjoin::fuzzy_join:
library(dplyr)
library(fuzzyjoin)
d %>%
fuzzy_join(r, by = c("value" = "from", "value" = "to"),
match_fun = list(`>=`, `<`)) %>%
select(value, class)
Result:
# A tibble: 31 x 2
value class
<dbl> <chr>
1 1.0 A
2 1.2 A
3 1.4 A
4 1.6 A
5 1.8 A
6 2.0 A
7 2.0 B
8 2.2 B
9 2.4 B
10 2.6 B
# ... with 21 more rows
Notice I added single quotes around from and to since those are reserved words for the SQL language.
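An update for readers on current tooling (my addition; assumes dplyr >= 1.1.0): dplyr has since gained non-equi joins via join_by(), which makes the original request a one-liner:
library(dplyr)  # version 1.1.0 or later
d %>%
  inner_join(r, by = join_by(value >= from, value < to)) %>%
  select(value, class)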
OK, thanks for the advice, this was pretty interesting. I finally wrote a range_join function (inspired by #ycw's code) and compared all the described solutions by runtime.
I like fuzzy_join, but with only 50k rows in d it needs more than 40 seconds. That's too slow.
Here is the comparison with 5k rows in d:
library(dplyr)
library(fuzzyjoin)
library(sqldf)
#join by range by #WiWeber
range_join <- function(x, y, value, left, right){
x_result <- tibble()
for (y_ in split(y, 1:nrow(y)))
x_result <- x_result %>% bind_rows(x[x[[value]] >= y_[[left]] & x[[value]] < y_[[right]],] %>% cbind(y_))
return(x_result)
}
#dynamic join by #ycw
dynamic_join <- function(d, r){
d$type <- NA_character_
for (r_ in split(r, r$type))
d <- d %>% mutate(type = ifelse(value >= r_$from & value < r_$to, r_$type, type))
return(d)
}
d <- tibble(value = seq(1,6, by = 0.001), join = TRUE)
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), type = LETTERS[seq(1,6)], join = TRUE)
# #useR sqldf - fast and intuitive but extra library with horrible code
start <- Sys.time()
d2 <- tbl_df(sqldf("select d.value, r.type from d
join r on d.value >= r.'from' and
d.value < r.'to'"))
Sys.time() - start
# #useR fuzzy_join .... very cool but veeeeeeeeeeeeeeeery slow
start <- Sys.time()
d2 <- d %>%
fuzzy_join(r, by = c("value" = "from", "value" = "to"), match_fun = list(`>=`, `<`)) %>%
select(value, type)
Sys.time() - start
# #jonathande4 cut pretty fast
start <- Sys.time()
d2 <- d
d2$type <- cut(d$value, unique(c(r$from, r$to)), r$type, right = FALSE)
Sys.time() - start
# #WiWeber floor
start <- Sys.time()
d2 <- d %>%
mutate(join_value = floor(value)) %>%
inner_join(r, by = c("join_value" = "from")) %>%
select(value, type)
Sys.time() - start
# #WiWeber cross join - filter
start <- Sys.time()
d2 <- d %>%
inner_join(r, by = "join") %>%
filter(value >= from, value < to) %>%
select(value, type)
Sys.time() - start
# #hardik-gupta sapply
start <- Sys.time()
d2 <- d %>%
mutate(
type = unlist(sapply(value, function (x) r[which(x >= r$from & x < r$to), "type"]))
) %>%
select(value, type)
Sys.time() - start
# #ycw re-dynamic join
start <- Sys.time()
d2 <- d %>% dynamic_join(r)
Sys.time() - start
# #WiWeber range_join
start <- Sys.time()
d2 <- d %>%
range_join(r, "value", "from", "to") %>%
select(value, type)
Sys.time() - start
Results:
# #useR sqldf - fast and intuitive but extra library with horrible code
Time difference of 0.06221986 secs
# #useR fuzzy_join .... very cool but veeeeeeeeeeeeeeeery slow
Time difference of 4.765595 secs
# #jonathande4 cut pretty fast
Time difference of 0.004637003 secs
# #WiWeber floor
Time difference of 0.02223396 secs
# #WiWeber cross join - filter
Time difference of 0.0201931 secs
# #hardik-gupta sapply
Time difference of 5.166633 secs
# #ycw dynamic join
Time difference of 0.03124094 secs
# #WiWeber range_join
Time difference of 0.02691698 secs
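A measurement note (my addition): single Sys.time() differences are noisy at these scales; the microbenchmark package repeats each expression and reports a distribution, e.g. for two of the candidates:
library(microbenchmark)
microbenchmark(
  cut_approach = cut(d$value, unique(c(r$from, r$to)), r$type, right = FALSE),
  floor_join   = d %>%
    mutate(join_value = floor(value)) %>%
    inner_join(r, by = c("join_value" = "from")),
  times = 20
)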
Cheers, WiWeber
You can use the cut function to create a "class" column in d and then use a left join.
d <- tibble(value = seq(1,6, by = 0.2))
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), class = LETTERS[seq(1,6)])
d[["class"]] <- cut(d[["value"]], c(0,2,3,4,5,6,Inf), c('A',"B", "C", "D", "E", "F"), right = FALSE)
d <- left_join(d, r)
To get the right buckets, you just need to work with the cut function to get what you want.
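For instance, instead of hard-coding the breaks and labels, they can be derived from r itself (a sketch, assuming the intervals in r are sorted and contiguous):
d[["class"]] <- cut(d[["value"]], breaks = unique(c(r$from, r$to)),
                    labels = r$class, right = FALSE)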
We can use sapply for this:
library(tibble)
d <- tibble(value = seq(1,6, by = 0.2))
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), class = LETTERS[seq(1,6)])
d <- cbind(d, data.frame(class = (unlist(sapply(d$value, function (x) r[which(x >= r$from & x < r$to), "class"]))) ) )
d
value class
1 1.0 A
2 1.2 A
3 1.4 A
4 1.6 A
5 1.8 A
6 2.0 B
7 2.2 B
8 2.4 B
9 2.6 B
10 2.8 B
11 3.0 C
12 3.2 C
13 3.4 C
14 3.6 C
15 3.8 C
16 4.0 D
17 4.2 D
18 4.4 D
19 4.6 D
20 4.8 D
21 5.0 E
22 5.2 E
23 5.4 E
24 5.6 E
25 5.8 E
26 6.0 F
We can use mutate and case_when from dplyr.
library(dplyr)
d2 <- d %>%
mutate(class = case_when(
value >= 1 & value < 2 ~ "A",
value >= 2 & value < 3 ~ "B",
value >= 3 & value < 4 ~ "C",
value >= 4 & value < 5 ~ "D",
value >= 5 & value < 6 ~ "E",
value >= 6 ~ "F"
))
d2
# A tibble: 26 x 2
value class
<dbl> <chr>
1 1.0 A
2 1.2 A
3 1.4 A
4 1.6 A
5 1.8 A
6 2.0 B
7 2.2 B
8 2.4 B
9 2.6 B
10 2.8 B
# ... with 16 more rows
Update
Here is a workaround by defining a function for this task.
d <- tibble(value = seq(1,6, by = 0.2))
r <- tibble(from = seq(1,6), to = c(seq(2,6),Inf), class = LETTERS[seq(1,6)])
library(dplyr)
# Define a function for dynamic join
dynamic_join <- function(d, r){
if (!("class" %in% colnames(d))){
d[["class"]] <- NA_character_
}
d <- d %>%
mutate(class = ifelse(value >= r$from & value < r$to, r$class, class))
return(d)
}
re_dynamic_join <- function(d, r){
r_list <- split(r, r$class)
for (i in 1:length(r_list)){
d <- dynamic_join(d, r_list[[i]])
}
return(d)
}
# Apply the function
d2 <- d %>% re_dynamic_join(r)
d2
# A tibble: 26 x 2
value class
<dbl> <chr>
1 1.0 A
2 1.2 A
3 1.4 A
4 1.6 A
5 1.8 A
6 2.0 B
7 2.2 B
8 2.4 B
9 2.6 B
10 2.8 B
# ... with 16 more rows
I really liked #WiWeber's range_join function, but it errors when a range matches no records (for example, when some values fall outside every range). Here's a modification (note that it also makes the right bound inclusive, <=, unlike the original's strict <, so exact boundary values can match two adjacent ranges):
library(dplyr)
d <- tibble(value = c(seq(1,4, by = 0.2),9))
r <- tibble(from = seq(1,5), to = c(seq(2,5),8), class = LETTERS[seq(1,5)])
range_join <- function(x, y, value, left, right){
  all_matches <- tibble()
  x <- as.data.frame(x)
  y <- as.data.frame(y)
  x$index <- x[, value]
  for (i in 1:nrow(y)){
    # right bound is inclusive here (<=), unlike the original (<)
    matches <- x %>% filter(index >= y[i, left] & index <= y[i, right])
    if (nrow(matches) > 0){
      all_matches <- all_matches %>% bind_rows(matches %>% cbind(y[i, ]))
    }
  }
  all_matches <- all_matches %>% select(-index)
  return(all_matches)
}
data <- d %>%
range_join(r, "value", "from", "to")
data
I have a function that counts zeros in each column of a large data frame. Now I want to count the zeros in each column after grouping by category.
Here is the example:
zero_rate <- function(df) {
z_rate_list <- sapply(df, function(x) {
data.frame(
n_zero=length(which(x==0)),
n=length(x),
z_rate=length(which(x==0))/length(x))
})
d <- data.frame(z_rate_list)
d <- sapply(d, unlist)
d <- as.data.frame(d)
return(d)}
df = data.frame(var1=c(1,0,NA,4,NA,6,7,0,0,10),var2=c(11,NA,NA,0,NA,16,0,NA,19,NA))
df1= data.frame(cat = c(1,1,1,1,1,2,2,2,2,2),df)
library(dplyr)
zero_rate_df <- df1 %>% group_by(cat) %>% do(zero_rate(.))
Here zero_rate(df) works just as I expected, but when I group the data by cat and compute the zero rate for each column within each category, the result is not what I expect.
I expect something like this:
cat          var1  var2
1   n_zero      1     1
    n           5     5
    z_rate    0.2   0.2
2   n_zero      2     1
    n           5     5
    z_rate    0.4   0.2
Any suggestions? Thank you.
I came up with the following code; .[-1] is used to drop the grouping column:
zero_rate <- function(df){
res <- lapply(df, function(x){
y <- c(sum(x == 0, na.rm = T), length(x))
c(y, y[1]/y[2])
})
res <- do.call(cbind.data.frame, res)
res$vars <- c('n_zero', 'n', 'z_rate')
res
}
df1 %>% group_by(cat) %>% do( zero_rate(.[-1]))
# cat var1 var2 vars
# <dbl> <dbl> <dbl> <chr>
# 1 1 1.0 1.0 n_zero
# 2 1 5.0 5.0 n
# 3 1 0.2 0.2 z_rate
# 4 2 2.0 1.0 n_zero
# 5 2 5.0 5.0 n
# 6 2 0.4 0.2 z_rate
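As a side note (my addition; do() has since been superseded in dplyr): group_modify passes each group without the grouping column, so the .[-1] trick isn't needed:
library(dplyr)
df1 %>%
  group_by(cat) %>%
  group_modify(~ zero_rate(.x))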
I am pretty sure I am complicating things. I have a data frame with p variables (here: v1 to v3) and two factor variables (here: sex and unemp):
> head(df)
sex unemp v1 v2 v3
1 0 0 2 4 4
2 0 0 2 1 1
3 1 0 3 3 5
4 1 1 2 3 5
5 0 0 1 2 5
6 1 0 3 5 4
I now would like to summarize my data (i.e. compute the median and mean, then rearrange the summary table) so that the resulting data frame looks like this (for men or women):
> df.res.men
median.unemp.1 median.unemp.0 mean.unemp.1 mean.unemp.0
v1 2.0 2.0 2.666667 2.391304
v2 2.0 3.5 2.500000 3.369565
v3 4.5 3.0 4.166667 2.956522
Here is the full code:
library(plyr)
## generate data
set.seed(1)
df <- data.frame(sex=rbinom(100, 1, 0.5),
unemp=rbinom(100, 1, 0.2),
v1=sample(1:5, 100, replace=TRUE),
v2=sample(1:5, 100, replace=TRUE),
v3=sample(1:5, 100, replace=TRUE)
)
head(df)
## compute mean and median for all variables by sex and unemp
df.mean <- ddply(df, .(unemp, sex), .fun=colMeans, na.rm=TRUE)
df.mean
df.median <- ddply(df, .(unemp, sex), .fun=function(x)apply(x,2,median, na.rm=TRUE))
df.median
## rearrange summary table
df.res.men <- cbind(t(subset(df.median, sex==0 & unemp==1)),
t(subset(df.median, sex==0 & unemp==0)),
t(subset(df.mean, sex==0 & unemp==1)),
t(subset(df.mean, sex==0 & unemp==0)))
df.res.men <- df.res.men[-c(1:2),]
colnames(df.res.men) <- c("median.unemp.1", "median.unemp.0",
"mean.unemp.1", "mean.unemp.0")
df.res.men
Here is one approach
library(plyr); library(reshape2)
dfm <- melt(df, id = c('sex', 'unemp'))
df2 <- ddply(dfm, .(variable, unemp, sex), summarize,
avg = mean(value), med = median(value))
df2m <- melt(df2, id = 1:3, variable.name = 'sum_fun')
df_0 <- dcast(df2m, sex + variable ~ sum_fun + unemp, subset = .(sex == 0))
sex variable avg_0 avg_1 med_0 med_1
1 0 v1 2.794872 3.0000 3 3.5
2 0 v2 3.102564 2.8750 3 3.0
3 0 v3 3.205128 3.1875 3 4.0
Here's a two-line solution using reshape alone. The default column names need a bit of work, but the syntax of the melt() and cast() statements is nicely expressive.
(One important note -- unlike reshape, reshape2 cannot take a vector of summary function names as its fun.aggregate argument, as I've done below with c(mean, median). Thanks to Ramnath for pointing that out.)
library(reshape)
dmelt <- melt(df, id=c('sex', 'unemp'))
# Results for sex 0
cast(dmelt, variable ~ unemp, c(mean, median), subset = sex==0)
# variable 0_mean 0_median 1_mean 1_median
# 1 v1 2.391304 2.0 2.666667 2.0
# 2 v2 3.369565 3.5 2.500000 2.0
# 3 v3 2.956522 3.0 4.166667 4.5
# Results for sex 1
cast(dmelt, variable ~ unemp, c(mean, median), subset = sex==1)
# variable 0_mean 0_median 1_mean 1_median
# 1 v1 3.027778 3 2.416667 2.0
# 2 v2 2.638889 2 2.750000 3.0
# 3 v3 3.027778 3 2.583333 2.5
A solution without reshaping the data:
f <- function(x) rbind(each(mean,median)(na.omit(x)))
#
# This should work, but it doesn't quite:
# it almost works, except for labelling the output with the function names.
#
df.res <- ddply(df,.(unemp, sex),.fun=numcolwise(f))
#
# Some workaround
#
df.res <- dlply(df,.(unemp, sex),.fun=numcolwise(f))
df.res <- cbind(attr(df.res,"split_labels"),do.call(rbind,df.res))
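For completeness, a modern equivalent with dplyr and tidyr (my addition; assumes current versions of both, which postdate these plyr/reshape answers):
library(dplyr)
library(tidyr)
df %>%
  filter(sex == 0) %>%                                            # men only
  group_by(unemp) %>%
  summarise(across(v1:v3, list(mean = mean, median = median))) %>%
  pivot_longer(-unemp, names_to = c("variable", "stat"), names_sep = "_") %>%
  pivot_wider(names_from = c(stat, unemp), values_from = value)   # stat_unemp columns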