How to maintain date format when after applying function - r

I have dataframe with a poorly formatted date information.
date = c("18102016", "11102017", "4052017", "18102018", "3102018")
df <- data.frame(date = date, x1 = 1:5, x2 = rep(1,5))
I have already written the function fix_date_all() which does the proper formatting when applied to the vector df$date
fix_date_all<- function(date){
fix_date <- function(d) {
if (nchar(d) != 8) d <- paste0("0", d)
dd <- d %>% substr(1,2)
mm <- d %>% substr(3,4)
yyyy <- d %>% substr(5,8)
d <- paste0(dd, ".", mm, ".", yyyy) %>% as.Date("%d.%m.%Y")
d
}
lapply(date, fix_date)
}
fix_date_all(df$date)
Now I would like to transform this variable to a proper date format using a tidyverse like style:
df %>% mutate(across(date, fix_date_all))
However, when using it in a tidyverse style, the date gets screwed up.
date x1 x2
1 17092 1 1
2 17450 2 1
3 17290 3 1
4 17822 4 1
5 17807 5 1

A second option would be to get rid of lapply and rewrite your function using e.g. string::str_pad:
library(dplyr, warn.conflicts = FALSE)
fix_date_all<- function(date){
date %>%
stringr::str_pad(width = 8, pad = "0") %>%
as.Date(format = "%d%m%Y")
}
fix_date_all(df$date)
#> [1] "2016-10-18" "2017-10-11" "2017-05-04" "2018-10-18" "2018-10-03"
df %>%
mutate(across(date, fix_date_all))
#> date x1 x2
#> 1 2016-10-18 1 1
#> 2 2017-10-11 2 1
#> 3 2017-05-04 3 1
#> 4 2018-10-18 4 1
#> 5 2018-10-03 5 1

The output is a list from the lapply call.
fix_date_all(df$date)
[[1]]
[1] "2016-10-18"
[[2]]
[1] "2017-10-11"
[[3]]
[1] "2017-05-04"
[[4]]
[1] "2018-10-18"
[[5]]
[1] "2018-10-03"
We need to flatten it with c
library(dplyr)
df %>%
mutate(date = fix_date_all(date) %>%
do.call(c, .))
-output
date x1 x2
1 2016-10-18 1 1
2 2017-10-11 2 1
3 2017-05-04 3 1
4 2018-10-18 4 1
5 2018-10-03 5 1
Or in the newer version of purrr, use list_c
library(purrr)
df %>%
mutate(date = fix_date_all(date) %>% list_c)
date x1 x2
1 2016-10-18 1 1
2 2017-10-11 2 1
3 2017-05-04 3 1
4 2018-10-18 4 1
5 2018-10-03 5 1

The sprintf will prepend with a 0 if short and then we convert to Date. No packages are used.
as.Date(sprintf("%08d", as.numeric(date)), "%d%m%Y")
## [1] "2016-10-18" "2017-10-11" "2017-05-04" "2018-10-18" "2018-10-03"
Note that it is vectorized and works within mutate:
library(dplyr)
data.frame(date) %>%
mutate(date = as.Date(sprintf("%08d", as.numeric(date)), "%d%m%Y"))
## date
## 1 2016-10-18
## 2 2017-10-11
## 3 2017-05-04
## 4 2018-10-18
## 5 2018-10-03

Instead of lapply use sapply. But at the same time, just use vectorized ifelse as shown below:
fix_date_all<- function(d){
d <- ifelse(nchar(d) != 8, paste0("0", d), d)
as.Date(d, "%d%m%Y")
}
df %>%
mutate(date = fix_date_all(date))
date x1 x2
1 2016-10-18 1 1
2 2017-10-11 2 1
3 2017-05-04 3 1
4 2018-10-18 4 1
5 2018-10-03 5 1
>

Related

Rename variables (columns) in datasets that are in a list

I have a list of datasets with different variables. I need to rename them according to the naming convention in the name dataframe below.
df1 <- data.frame(x1= c(1,2,3), x2=c(1,2,3))
df2 <- data.frame(x1= c(1,2,3), x3=c(1,2,3))
df3 <- data.frame(x4= c(1,2,3), x5=c(1,2,3))
mylist <- list(df1,df2,df3)
name <- data.frame(old= c("x1","x2","x3","x4","x5"), new=c("A","B","A","A","C"))
I can do this one by one, but I am wondering how to be more efficient and rename them all at once
newdf <- map_if(mylist, ~ "x1" %in% colnames(.x),
.f = list(. %>% rename("A"="x1")))
I was hoping something like this would work, but it doesn't:
for (i in nrow(name)) {
newdf <- map_if(mylist, ~ name[i,1] %in% colnames(.x),
.f = list(. %>% rename(name[2] = name[1])))
}
You can use setnames from data.table, which can take a list of old and new names.
library(data.table)
library(purrr)
map(mylist, ~ setnames(.x, name$old, name$new, skip_absent=TRUE))
Output
[[1]]
A B
1 1 1
2 2 2
3 3 3
[[2]]
A A
1 1 1
2 2 2
3 3 3
[[3]]
A C
1 1 1
2 2 2
3 3 3
Column names must be unique, so there is a typo (?) in your example (as "x1" and "x3" would both be re-labelled as "A").
If we fix the typo, here is an option using map and rename_with.
name <- data.frame(old= c("x1","x2","x3","x4","x5"), new=c("A","B","C","D","E"))
library(tidyverse)
mylist %>%
map(function(df) df %>% rename_with(~ name$new[match(.x, name$old)]))
#[[1]]
# A B
#1 1 1
#2 2 2
#3 3 3
#
#[[2]]
# A C
#1 1 1
#2 2 2
#3 3 3
#
#[[3]]
# D E
#1 1 1
#2 2 2
#3 3 3
You could use set_names + recode:
library(tidyverse)
map(mylist, set_names, ~ recode(.x, !!!deframe(name)))
[[1]]
A B
1 1 1
2 2 2
3 3 3
[[2]]
A A
1 1 1
2 2 2
3 3 3
[[3]]
A C
1 1 1
2 2 2
3 3 3

Easy way to convert text columns to numeric

I have this example dataset
x <- c("hot", "cold", "warm", "hot", "hot")
y <- c("happy", "content", "happy", "sad", "annoyed")
df <- data.frame(x, y)
I want to find a quick way to convert the text to numbers, it doesn't matter which order the numbers are.
So the output would be:
x y
1 1
2 2
3 1
1 3
1 4
Many Thanks
With Base R:
df[] <- lapply(df, function(x) as.numeric(as.factor(x)))
df
#> x y
#> 1 2 3
#> 2 1 2
#> 3 3 3
#> 4 2 4
#> 5 2 1
With purrr:
library(purrr)
df %>% map(as.factor) %>% map_dfc(as.numeric)
#> # A tibble: 5 x 2
#> x y
#> <dbl> <dbl>
#> 1 2 3
#> 2 1 2
#> 3 3 3
#> 4 2 4
#> 5 2 1
Keep track of the labels with labelled:
df <- df %>% map(as.factor) %>% map_dfc(labelled::to_labelled)
df
#> # A tibble: 5 x 2
#> x y
#> <dbl+lbl> <dbl+lbl>
#> 1 2 [hot] 3 [happy]
#> 2 1 [cold] 2 [content]
#> 3 3 [warm] 3 [happy]
#> 4 2 [hot] 4 [sad]
#> 5 2 [hot] 1 [annoyed]
df$x
#> <labelled<double>[5]>
#> [1] 2 1 3 2 2
#>
#> Labels:
#> value label
#> 1 cold
#> 2 hot
#> 3 warm
Or keep the numbers next to the original values in a new column:
df[paste0(names(df), "_num")] <- lapply(df, function(x) as.numeric(as.factor(x)))
df
#> x y x_num y_num
#> 1 hot happy 2 3
#> 2 cold content 1 2
#> 3 warm happy 3 3
#> 4 hot sad 2 4
#> 5 hot annoyed 2 1
If you want to change only the character columns to numeric:
library(purrr)
df %>% map_if(is.character, as.factor) %>% map_dfc(as.numeric)
df %>% map_if(is.character, as.factor) %>% map_dfc(labelled::to_labelled)
Or choose them by name:
library(purrr)
cols <- c("x", "y")
df %>% map_at(cols, as.factor) %>% map_dfc(as.numeric)
df %>% map_at(cols, as.factor) %>% map_dfc(labelled::to_labelled)
df[paste0(cols, "_num")] <- lapply(df[cols], function(x) as.numeric(as.factor(x)))
You could use rapply:
rapply(type.convert(df), function(x)as.integer(factor(x, unique(x))),'factor',how = 'replace')
x y
1 1 1
2 2 2
3 3 1
4 1 3
5 1 4
Maybe try this with dplyr:
library(dplyr)
#Code
newdf <- df %>% mutate(across(everything(),~as.numeric(as.factor(.))))
Output:
x y
1 2 3
2 1 2
3 3 3
4 2 4
5 2 1
In order to see the values, you can try this:
#Code 2
newdf2 <- df %>% mutate(across(everything(),~as.factor(.))) %>%
mutate(across(everything(),.fns = list(value = ~ as.numeric(.))))
Output:
x y x_value y_value
1 hot happy 2 3
2 cold content 1 2
3 warm happy 3 3
4 hot sad 2 4
5 hot annoyed 2 1
If we add a numeric variable, this should work:
#Code 3
newdf <- df %>% mutate(across(x:y,~as.factor(.))) %>%
mutate(across(x:y,.fns = list(value = ~ as.numeric(.))))
Output:
x y number x_value y_value
1 hot happy 10 2 3
2 cold content 20 1 2
3 warm happy 30 3 3
4 hot sad 40 2 4
5 hot annoyed 50 2 1
We can use match
df[] <- lapply(df, function(x) match(x, unique(x)))

How to select continuous rows in R data frame based on conditions?

I have a dataframe df and it has date ,group and gap days column. I want to select for a group all rows where gap days is continuously 1 from latest date (max date) . If gap days is not equal to 1 , then we ignore rows till the point where gap days is not equal to 1 . For reproducible purpose I have created current df and expected df...
df<-data.frame(Date=c("2018-10-15","2018-10-16","2018-10-17",
"2018-10-14","2018-10-15","2018-10-16","2018-10-18","2018-10-19",
"2018-10-18","2018-10-21","2018-10-23","2018-10-24","2018-10-27","2018-10-28"),Group=c("a","a","a","b","b","b","b","b","c","c","c","c","c","c"),Gap_Days=c(1,1,1,1,1,2,1,1,3,2,1,3,1,1))
df_expected<-data.frame(Date=c("2018-10-15","2018-10-16","2018-10-17","2018-10-18","2018-10-19","2018-10-27","2018-10-28"),Group=c("a","a","a", "b","b","c","c"),Gap_Days=c(1,1,1,1,1,1,1))
The only difference between my first comment and what works now is the introduction of grouping to the question.
Base R:
do.call("rbind", by(df, df$Group, FUN=function(d) d[rev(cumall(rev(d$Gap_Days == 1))),]))
# Date Group Gap_Days
# a.1 2018-10-15 a 1
# a.2 2018-10-16 a 1
# a.3 2018-10-17 a 1
# b.7 2018-10-18 b 1
# b.8 2018-10-19 b 1
# c.13 2018-10-27 c 1
# c.14 2018-10-28 c 1
Tidyverse:
df %>%
group_by(Group) %>%
filter(rev(cumall(rev(Gap_Days == 1)))) %>%
ungroup()
# # A tibble: 7 x 3
# Date Group Gap_Days
# <fct> <fct> <dbl>
# 1 2018-10-15 a 1
# 2 2018-10-16 a 1
# 3 2018-10-17 a 1
# 4 2018-10-18 b 1
# 5 2018-10-19 b 1
# 6 2018-10-27 c 1
# 7 2018-10-28 c 1
Here is one method with tidyverse
library(dplyr)
library(data.table)
df %>%
group_by(grp = rleid(Gap_Days),
ind = any(Date == max(.data$Date))) %>%
ungroup %>%
filter(grp == max(grp) & ind) %>%
select(-ind, -grp)
# A tibble: 3 x 2
# Date Gap_Days
# <date> <dbl>
#1 2018-10-19 1
#2 2018-10-20 1
#3 2018-10-21 1
If the 'Date' column is already ordered, then we just need to check the 1s in 'Gap_Days
i1 <- inverse.rle(within.list(rle(df$Gap_Days == 1),
values[lengths < max(lengths) & values] <- FALSE))
df[i1,, drop = FALSE]

insert rows between dates by group

I want to insert rows between two dates by group. My way of doing it is so complicated that I insert missing values by last observation carry forwards and then merge. I was wondering is there any easier way to achieve it.
# sample data
user<-c("A","A","B","B","B")
dummy<-c(1,1,1,1,1)
date<-as.Date(c("2017/1/3","2017/1/6","2016/5/1","2016/5/3","2016/5/5"))
dt<-data.frame(user,dummy,date)
user dummy date
1 A 1 2017-01-03
2 A 1 2017-01-06
3 B 1 2016-05-01
4 B 1 2016-05-03
5 B 1 2016-05-05
Desired output
By using dplyr and tidyr :)(one line solution )
library(dplyr)
library(tidyr)
dt %>% group_by(user) %>% complete(date=full_seq(date,1),fill=list(dummy=0))
# A tibble: 9 x 3
# Groups: user [2]
user date dummy
<fctr> <date> <dbl>
1 A 2017-01-03 1
2 A 2017-01-04 0
3 A 2017-01-05 0
4 A 2017-01-06 1
5 B 2016-05-01 1
6 B 2016-05-02 0
7 B 2016-05-03 1
8 B 2016-05-04 0
9 B 2016-05-05 1
you can try this
library(data.table)
setDT(dt)
tmp <- dt[, .(date = seq.Date(min(date), max(date), by = '1 day')), by =
'user']
dt <- merge(tmp, dt, by = c('user', 'date'), all.x = TRUE)
dt[, dummy := ifelse(is.na(dummy), 0, dummy)]
We can use the tidyverse to achieve this task.
library(tidyverse)
dt2 <- dt %>%
group_by(user) %>%
do(date = seq(from = min(.$date), to = max(.$date), by = 1)) %>%
unnest() %>%
left_join(dt, by = c("user", "date")) %>%
replace_na(list(dummy = 0)) %>%
select(colnames(dt))
dt2
# A tibble: 9 x 3
user dummy date
<fctr> <dbl> <date>
1 A 1 2017-01-03
2 A 0 2017-01-04
3 A 0 2017-01-05
4 A 1 2017-01-06
5 B 1 2016-05-01
6 B 0 2016-05-02
7 B 1 2016-05-03
8 B 0 2016-05-04
9 B 1 2016-05-05
The simplest way that I have found to do this is with the padr library.
library(padr)
dt_padded <- pad(dt, group = "user", by = "date") %>%
replace_na(list(dummy=0))
A Base R (not quite as elegant) solution:
# Data
user<-c("A","A","B","B","B")
dummy<-c(1,1,1,1,1)
date<-as.Date(c("2017/1/3","2017/1/6","2016/5/1","2016/5/3","2016/5/5"))
df1 <-data.frame(user,dummy,date)
# Solution
do.call(rbind, lapply(split(df1, df1$user), function(df) {
dff <- data.frame(user=df$user[1], dummy=0, date=seq.Date(min(df$date), max(df$date), 'day'))
dff[dff$date %in% df$date, "dummy"] <- df$dummy[1]
dff
}))
# user dummy date
# A 1 2017-01-03
# A 0 2017-01-04
# A 0 2017-01-05
# A 1 2017-01-06
# B 1 2016-05-01
# B 0 2016-05-02
# B 1 2016-05-03
# B 0 2016-05-04
# B 1 2016-05-05
Assuming your data is called df1, and you want to add dates between two days try this:
library(dplyr)
df2 <- seq.Date(as.Date("2015-01-03"), as.Date("2015-01-06"), by ="day")
left_join(df2, df1)
If you're simply trying to add a new record, I suggest using rbind.
rbind()

Count number of values in row using dplyr

This question should have a simple, elegant solution but I can't figure it out, so here it goes:
Let's say I have the following dataset and I want to count the number of 2s present in each row using dplyr.
set.seed(1)
ID <- LETTERS[1:5]
X1 <- sample(1:5, 5,T)
X2 <- sample(1:5, 5,T)
X3 <- sample(1:5, 5,T)
df <- data.frame(ID,X1,X2,X3)
library(dplyr)
Now, the following works:
df %>%
rowwise %>%
mutate(numtwos = sum(c(X1,X2,X3) == 2))
But how do I avoid typing out all of the column names?
I know this is probably easier to do without dplyr, but more generally I want to know how I can use dplyr's mutate with multiple columns without typing out all the column names.
Try rowSums:
> set.seed(1)
> ID <- LETTERS[1:5]
> X1 <- sample(1:5, 5,T)
> X2 <- sample(1:5, 5,T)
> X3 <- sample(1:5, 5,T)
> df <- data.frame(ID,X1,X2,X3)
> df
ID X1 X2 X3
1 A 2 5 2
2 B 2 5 1
3 C 3 4 4
4 D 5 4 2
5 E 2 1 4
> rowSums(df == 2)
[1] 2 1 0 1 1
Alternatively, with dplyr:
> df %>% mutate(numtwos = rowSums(. == 2))
ID X1 X2 X3 numtwos
1 A 2 5 2 2
2 B 2 5 1 1
3 C 3 4 4 0
4 D 5 4 2 1
5 E 2 1 4 1
Here's another alternative using purrr:
library(purrr)
df %>%
by_row(function(x) {
sum(x[-1] == 2) },
.to = "numtwos",
.collate = "cols"
)
Which gives:
#Source: local data frame [5 x 5]
#
# ID X1 X2 X3 numtwos
# <fctr> <int> <int> <int> <int>
#1 A 2 5 2 2
#2 B 2 5 1 1
#3 C 3 4 4 0
#4 D 5 4 2 1
#5 E 2 1 4 1
As per mentioned in the NEWS, row based functionals are still maturing in dplyr:
We are still figuring out what belongs in dplyr and what belongs in
purrr. Expect much experimentation and many changes with these
functions.
Benchmark
We can see how rowwise() and do() compare to purrr::by_row() for this type of problem and how they "perform" against rowSums() and the tidy data way:
largedf <- df[rep(seq_len(nrow(df)), 10e3), ]
library(microbenchmark)
microbenchmark(
steven = largedf %>%
by_row(function(x) {
sum(x[-1] == 2) },
.to = "numtwos",
.collate = "cols"),
psidom = largedf %>%
rowwise %>%
do(data_frame(numtwos = sum(.[-1] == 2))) %>%
cbind(largedf, .),
gopala = largedf %>%
gather(key, value, -ID) %>%
group_by(ID) %>%
summarise(numtwos = sum(value == 2)) %>%
inner_join(largedf, .),
evan = largedf %>%
mutate(numtwos = rowSums(. == 2)),
times = 10L,
unit = "relative"
)
Results:
#Unit: relative
# expr min lq mean median uq max neval cld
# steven 1225.190659 1261.466936 1267.737126 1227.762573 1276.07977 1339.841636 10 b
# psidom 3677.603240 3759.402212 3726.891458 3678.717170 3728.78828 3777.425492 10 c
# gopala 2.715005 2.684599 2.638425 2.612631 2.59827 2.572972 10 a
# evan 1.000000 1.000000 1.000000 1.000000 1.00000 1.000000 10 a
Just wanted to add to the answer of #evan.oman in case you only want to sum rows for specific columns, not all of them. You can use the regular select and/or select_helpers functions. In this example, we don't want to include X1 in rowSums:
df %>%
mutate(numtwos = rowSums(select(., -X1) == 2))
ID X1 X2 X3 numtwos
1 A 2 5 2 1
2 B 2 5 1 0
3 C 3 4 4 0
4 D 5 4 2 1
5 E 2 1 4 0
One approach is to use a combination of dplyr and tidyr to convert data into long format, and do the computation:
library(dplyr)
library(tidyr)
df %>%
gather(key, value, -ID) %>%
group_by(ID) %>%
summarise(numtwos = sum(value == 2)) %>%
inner_join(df, .)
Output is as follows:
ID X1 X2 X3 numtwos
1 A 2 5 2 2
2 B 2 5 1 1
3 C 3 4 4 0
4 D 5 4 2 1
5 E 2 1 4 1
You can use do, which doesn't add the column to your original data frame and you need to add the column to your original data frame.
df %>%
rowwise %>%
do(numtwos = sum(.[-1] == 2)) %>%
data.frame
numtwos
1 2
2 1
3 0
4 1
5 1
Add a cbind to bind the new column to the original data frame:
df %>%
rowwise %>%
do(numtwos = sum(.[-1] == 2)) %>%
data.frame %>% cbind(df, .)
ID X1 X2 X3 numtwos
1 A 2 5 2 2
2 B 2 5 1 1
3 C 3 4 4 0
4 D 5 4 2 1
5 E 2 1 4 1

Resources