Related
I would like to rename the Station values in DF, for example DA056 to Happy and AB786 to Sad.
library(tidyverse)

# Two stations with three Level readings each, stacked into one data frame.
DF1 <- data.frame(Station = rep("DA056", times = 3), Level = seq(100L, 102L))
DF2 <- data.frame(Station = rep("AB786", times = 3), Level = seq(201L, 203L))
DF <- bind_rows(DF1, DF2)
We can use factor with labels specified for corresponding levels
library(dplyr)

# Turn Station into a factor: the old codes are the levels and the new names
# are the labels, matched by position.
DF <- mutate(
  DF,
  Station = factor(
    Station,
    levels = c("DA056", "AB786"),
    labels = c("Happy", "Sad")
  )
)
DF$Station
#[1] Happy Happy Happy Sad Sad Sad
#Levels: Happy Sad
Or with recode
# Each recode() argument pairs an old value (argument name) with its
# replacement (argument value).
mutate(DF, Station = recode(Station, DA056 = "Happy", AB786 = "Sad"))
# Station Level
#1 Happy 100
#2 Happy 101
#3 Happy 102
#4 Sad 201
#5 Sad 202
#6 Sad 203
If there are many values to be changed, a better option is a join after creating a key/val dataset
# Lookup table: one row per station code with its replacement label.
keyval <- data.frame(
  Station = c("DA056", "AB786"),
  val = c("Happy", "Sad"),
  stringsAsFactors = FALSE
)

DF %>%
  # Name the key explicitly so the join does not depend on guessed columns.
  left_join(keyval, by = "Station") %>%
  # Rows without a match in keyval have val = NA and keep their Station value.
  mutate(Station = coalesce(val, Station)) %>%
  # The helper column is no longer needed once Station has been replaced.
  select(-val)
Or with base R
# Base R equivalent. The data frame is named DF; R is case-sensitive, so the
# lookup has to happen inside DF rather than `df`.
DF$Station <- with(
  DF,
  factor(Station, levels = c("DA056", "AB786"), labels = c("Happy", "Sad"))
)
An option is to use dplyr::case_when:
library(dplyr)

# Keep Station as character so every case_when() branch has the same type.
DF1 <- data.frame(
  Station = rep("DA056", 3),
  Level = 100:102,
  stringsAsFactors = FALSE
)
DF2 <- data.frame(
  Station = rep("AB786", 3),
  Level = 201:203,
  stringsAsFactors = FALSE
)
DF <- bind_rows(DF1, DF2)

DF <- DF %>%
  mutate(
    Station = case_when(
      Station == "DA056" ~ "Happy",
      Station == "AB786" ~ "Sad",
      # Any other station keeps its original code.
      TRUE ~ Station
    )
  )
Output
> DF
Station Level
1 Happy 100
2 Happy 101
3 Happy 102
4 Sad 201
5 Sad 202
6 Sad 203
You can do it using case_when:
DF %>%
  mutate(
    Station = case_when(
      Station == "DA056" ~ "Happy",
      Station == "AB786" ~ "Sad",
      # Without a fallback, any other station would silently become NA.
      TRUE ~ as.character(Station)
    )
  )
Another simple solution
# Relabel the two known codes; any other station keeps its original value
# instead of being lumped into "Sad".
DF$Station <- ifelse(
  DF$Station == "DA056",
  "Happy",
  ifelse(DF$Station == "AB786", "Sad", as.character(DF$Station))
)
I have a list (bbb) with 5 elements in it, i.e., each element for a year, like 2010, 2011, ... , 2014:
The first one in the list is this:
> bbb[1]
$`2010`
Date Average
X2010.01.01 2010-01-01 2.079090e-03
X2010.01.02 2010-01-02 5.147627e-04
X2010.01.03 2010-01-03 2.997464e-04
X2010.01.04 2010-01-04 1.375538e-04
X2010.01.05 2010-01-05 1.332109e-04
The second one in the list is this:
> bbb[2]
$`2011`
Date Average
X2011.01.01 2011-01-01 1.546253e-03
X2011.01.02 2011-01-02 1.152864e-03
X2011.01.03 2011-01-03 1.752446e-03
X2011.01.04 2011-01-04 2.639658e-03
X2011.01.05 2011-01-05 5.231150e-03
X2011.01.06 2011-01-06 8.909878e-04
And so on.
Here is my question:
How can I save all of these list's elements in 1 sheet of an Excel file to have something like this:
Your help would be highly appreciated.
You can do this using dcast.
# Example input: one data frame per year, each with a different number of days.
bbb <- setNames(
  list(
    data.frame(date = as.Date("2010-01-01") + 0:4, avg = 1:5),
    data.frame(date = as.Date("2011-01-01") + 0:5, avg = 11:16),
    data.frame(date = as.Date("2012-01-01") + 0:9, avg = 21:30),
    data.frame(date = as.Date("2013-01-01") + 0:7, avg = 21:28)
  ),
  c("2010", "2011", "2012", "2013")
)

# Stack all years, then derive the year and a month-day label from each date.
df <- do.call(rbind, bbb)
df[["year"]] <- format(df[["date"]], format = "%Y")
df[["month_date"]] <- format(df[["date"]], format = "%b-%d")
library(data.table)
library(openxlsx)

# data.table's dcast() is written for data.table input, so convert a copy of
# `df` first; `df` itself stays a plain data.frame.
# One row per month_date, one column per year, filled with avg.
df_dcast <- dcast(as.data.table(df), month_date ~ year, value.var = "avg")
write.xlsx(df_dcast, "example1.xlsx")
Or using spread
library(dplyr)
library(tidyr)

# Drop the full date, then spread the years into columns of avg values.
df2 <- spread(select(df, -date), key = year, value = avg)
write.xlsx(df2, "example2.xlsx")
This isn't very pretty, but it's the best I could think of right now. But you could take the dataframes and loop through the list, joining them by date like this:
library(tidyverse)
library(lubridate)

bbb <- list(
  `2010` = tibble(
    date = c("01-01-2010", "01-02-2010", "01-03-2010", "01-04-2010", "01-05-2010"),
    average = 11:15
  ),
  `2011` = tibble(
    date = c("01-01-2011", "01-02-2011", "01-03-2011", "01-04-2011", "01-05-2011"),
    average = 1:5
  ),
  `2012` = tibble(
    date = c("01-01-2012", "01-02-2012", "01-03-2012", "01-04-2012", "01-05-2012"),
    average = 6:10
  )
)

# Rewrite one year's dates as Day-Month and name the value column after the year.
# Assuming your list of dataframes has just 2 columns: date and average
reformat_year <- function(tbl, year_label) {
  # Parse once and reuse the result for both the day and the month label.
  parsed <- as.Date(tbl$date, format = "%m-%d-%Y")
  tbl$date <- paste(day(parsed), month(parsed, label = TRUE), sep = "-")
  colnames(tbl) <- c("date", year_label)
  tbl
}

# Reformat every year, then fold the tables together with full joins on the
# Day-Month key so that days missing from some years are still kept.
yearly <- Map(reformat_year, bbb, names(bbb))
df <- Reduce(function(acc, nxt) full_join(acc, nxt, by = "date"), yearly)
This loops through the list of dataframes and reformats the dates to Day-Month.
# A tibble: 5 x 4
date `2010` `2011` `2012`
<chr> <int> <int> <int>
1 1-Jan 11 1 6
2 2-Jan 12 2 7
3 3-Jan 13 3 8
4 4-Jan 14 4 9
5 5-Jan 15 5 10
You could then write that out with the writexl package function write_xlsx
I have two functions: date_diff and group_stat. After reading this tidyverse article, I am trying to create simple functions and chain them with the pipe.
The first function computes a difftime and stores it in a column named timex_minus_timey, but when I pipe the result into the next function I have to look up that column name so I can fill in summary_var. Is there a better way to do this?
library(tidyverse)
#
set.seed(42)

# Simulated records for hospitals A and B: a random drg letter, a start time
# scattered around 2018-02-03 08:00 UTC, and an end time 10 to 20 minutes later.
data <- dplyr::bind_rows(
  tibble::tibble(
    Hosp = rep("A", 1000),
    drg = sample(letters[1:5], size = 1000, replace = TRUE),
    time1 = as.POSIXct("2018-02-03 08:00:00", tz = "UTC") +
      rnorm(1000, mean = 0, sd = 60 * 60 * 60),
    time2 = time1 + runif(1000, min = 10 * 60, max = 20 * 60)
  ),
  tibble::tibble(
    Hosp = rep("B", 1000),
    drg = sample(letters[1:5], size = 1000, replace = TRUE),
    time1 = as.POSIXct("2018-02-03 08:00:00", tz = "UTC") +
      rnorm(1000, mean = 0, sd = 60 * 60 * 60),
    time2 = time1 + runif(1000, min = 10 * 60, max = 20 * 60)
  )
)
#' Add the difference between two timestamp columns as a numeric column
#'
#' The new column is named `<stamp1>_minus_<stamp2>`.
#'
#' @param df A data frame.
#' @param stamp1,stamp2 Unquoted timestamp columns; the result is
#'   `difftime(stamp1, stamp2)`.
#' @param units Units passed on to `difftime()`.
#' @return `df` with the difference column added.
date_diff <- function(df, stamp1, stamp2, units = "mins") {
  first_q <- rlang::enquo(stamp1)
  second_q <- rlang::enquo(stamp2)
  diff_col <- paste0(
    rlang::quo_name(first_q), "_minus_", rlang::quo_name(second_q)
  )
  dplyr::mutate(
    df,
    !!diff_col := as.numeric(difftime(!!first_q, !!second_q, units = units))
  )
}
#' Summarise one column per group with a caller-supplied function
#'
#' The output column is named `<summary_var>_<.f as written by the caller>`.
#'
#' @param df A data frame.
#' @param group_var Unquoted grouping column.
#' @param summary_var Unquoted column to summarise.
#' @param .f Function (or anything `rlang::as_function()` accepts); it is
#'   called with `na.rm = TRUE`.
#' @return One row per group with the summary column.
group_stat <- function(df, group_var, summary_var, .f) {
  stat_fn <- rlang::as_function(.f)
  group_q <- rlang::enquo(group_var)
  summary_q <- rlang::enquo(summary_var)
  out_name <- paste0(
    rlang::quo_name(summary_q), "_", deparse(substitute(.f))
  )
  grouped <- dplyr::group_by(df, !!group_q)
  dplyr::summarise(grouped, !!out_name := stat_fn(!!summary_q, na.rm = TRUE))
}
# Mean of (time2 - time1) per hospital; the difference column's name has to be
# spelled out for summary_var.
data %>%
  date_diff(stamp1 = time2, stamp2 = time1) %>%
  group_stat(group_var = Hosp, summary_var = time2_minus_time1, .f = mean)
#> # A tibble: 2 x 2
#> Hosp time2_minus_time1_mean
#> <chr> <dbl>
#> 1 A 15.1
#> 2 B 14.9
Created on 2019-05-02 by the reprex package (v0.2.1)
If you intend to always use these functions one after another in this way you could add an attribute containing the new column's name with date_diff, and have group_stat use that attribute. With the if condition, the attribute is only used if it exists and the summary_var argument is not provided.
#' Add a timestamp difference column and record its name as an attribute
#'
#' The new column is named `<stamp1>_minus_<stamp2>`; that name is also stored
#' in the `date_diff_nm` attribute of the result.
#'
#' @param df A data frame.
#' @param stamp1,stamp2 Unquoted timestamp columns; the result is
#'   `difftime(stamp1, stamp2)`.
#' @param units Units passed on to `difftime()`.
#' @return `df` with the difference column and the `date_diff_nm` attribute.
date_diff <- function(df, stamp1, stamp2, units = "mins") {
  first_q <- rlang::enquo(stamp1)
  second_q <- rlang::enquo(stamp2)
  diff_col <- paste0(
    rlang::quo_name(first_q), "_minus_", rlang::quo_name(second_q)
  )
  result <- dplyr::mutate(
    df,
    !!diff_col := as.numeric(difftime(!!first_q, !!second_q, units = units))
  )
  # Set the attribute after mutate() so it is present on the returned object.
  attr(result, "date_diff_nm") <- diff_col
  result
}
#' Summarise one column per group, defaulting to the column made by date_diff()
#'
#' The output column is named `<summary_var>_<.f as written by the caller>`.
#'
#' @param df A data frame, optionally carrying a `date_diff_nm` attribute.
#' @param group_var Unquoted grouping column.
#' @param summary_var Name of the column to summarise, as a string. When
#'   omitted, the `date_diff_nm` attribute of `df` is used.
#' @param .f Function (or anything `rlang::as_function()` accepts); it is
#'   called with `na.rm = TRUE`.
#' @return One row per group with the summary column.
group_stat <- function(df, group_var, summary_var, .f) {
  # Record how the caller wrote .f before doing anything else with it.
  fn_label <- deparse(substitute(.f))
  func <- rlang::as_function(.f)

  if (missing(summary_var)) {
    summary_var <- attr(df, "date_diff_nm")
    if (is.null(summary_var)) {
      stop(
        "`summary_var` was not supplied and `df` has no 'date_diff_nm' attribute.",
        call. = FALSE
      )
    }
  }

  group_var <- rlang::enquo(group_var)
  name <- paste0(summary_var, "_", fn_label)

  df %>%
    dplyr::group_by(!!group_var) %>%
    # summary_var is a string, so look the column up through the .data pronoun.
    dplyr::summarise(!!name := func(.data[[summary_var]], na.rm = TRUE))
}
# summary_var is omitted: group_stat() picks the column name up from the
# attribute set by date_diff().
data %>%
  date_diff(stamp1 = time2, stamp2 = time1) %>%
  group_stat(group_var = Hosp, .f = mean)
# # A tibble: 2 x 2
# Hosp time2_minus_time1_mean
# <chr> <dbl>
# 1 A 15.1
# 2 B 14.9
I need to rename a dataframe by days in analysis.
# Build "name", "day_1", "Freq_1", "Percent_1", "day_2", ... from the column
# count instead of typing every name: one "name" column plus three columns per
# day. With 28 columns this gives exactly the names for days 1 to 9.
n_days <- (ncol(dados) - 1L) %/% 3L
names(dados) <- c(
  "name",
  paste0(c("day_", "Freq_", "Percent_"), rep(seq_len(n_days), each = 3L))
)
In my analysis the data arrive as a list of data frames, where each data frame represents one day of analysis. I combine the data frames so that the result has a single 'name' column plus 'day_X', 'Freq_X' and 'Percent_X' columns for each data frame.
As return I need the columns to have the following names:
"name", "day_1","Freq_1","Percent_1","day_2","Freq_2","Percent_2","day_3","Freq_3","Percent_3"
How do I go about analyzing 50 days?
reproducible example:
# One small data frame per day of analysis.
day1 <- data.frame(
  name = c("jose", "mary", "julia"),
  freq = c(1, 5, 3),
  percent = c(40, 30, 20)
)
day2 <- data.frame(
  name = c("abner", "jose", "mary"),
  freq = c(3, 5, 4),
  percent = c(20, 30, 20)
)
day3 <- data.frame(
  name = c("abner", "jose", "mike"),
  freq = c(6, 2, 3),
  percent = c(40, 30, 70)
)
day4 <- data.frame(
  name = c("andre", "joseph", "ana"),
  freq = c(1, 5, 8),
  percent = c(40, 30, 20)
)
day5 <- data.frame(
  name = c("abner", "poli", "joseph"),
  freq = c(4, 3, 3),
  percent = c(10, 30, 10)
)

# Note: day3 is defined above but is not part of this list.
dates <- list(day1, day2, day4, day5)

# Outer-join the days one after another on "name".
data <- Reduce(
  f = function(acc, nxt) merge(acc, nxt, by = "name", all = TRUE),
  x = dates
)
Here's a way to get what you want using the tidyverse suite of packages. We start by putting the data in the "long" format - but add a column with the date:
# Stack the per-day data frames; imap_dfr() hands each one to the function
# together with its index (or name) in `dates`, which is stored as day_num.
long_form <- imap_dfr(dates, function(day_df, day_idx) {
  dplyr::mutate(day_df, day_num = day_idx)
})
Now, to get the wide format you are after, we need to reformat things a bit, as done in the following code. I'm not sure what is supposed to go in the day_# variables, as #useR mentioned in the comments, so it's missing. If you have a variable called day, the code should automatically do the right thing as written.
# Go fully long (one row per name/day/variable), suffix each variable name
# with its day, then spread back out to one column per variable-day pair.
wide_form <- long_form %>%
  gather(key, value, -name, -day_num) %>%
  dplyr::transmute(
    name,
    key = paste(key, day_num, sep = "_"),
    value
  ) %>%
  spread(key, value)
One can use dplyr::bind_rows to merge all the data frames from the list into a single data frame. Please name the list elements first so that day1, day2, etc. are set beforehand. Finally, gather and spread are used to reshape the data.
# Name the list elements day1, day2, ... so bind_rows() can record the source.
names(dates) <- paste0("day", seq_along(dates))
library(tidyverse)

dates %>%
  bind_rows(.id = "Name") %>%
  # Number the rows within each day so that spread() has a unique row key.
  group_by(Name) %>%
  mutate(rn = row_number()) %>%
  ungroup() %>%
  gather(Key, value, -Name, -rn) %>%
  # Combine variable and day into one column name, e.g. freq_day1.
  unite("Key", c("Key", "Name")) %>%
  spread(Key, value) %>%
  select(-rn)
Result:
# # A tibble: 3 x 12
# freq_day1 freq_day2 freq_day3 freq_day4 name_day1 name_day2 name_day3 name_day4 percent_day1 percent_day2 percent~ percent~
# * <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
# 1 1 3 1 4 jose abner andre abner 40 20 40 10
# 2 5 5 5 3 mary jose joseph poli 30 30 30 30
# 3 3 4 8 3 julia mary ana joseph 20 20 20 10
#
Data:
Data is slightly modified from OP. I have included stringsAsFactors = FALSE argument as part of data.frame to avoid a mutate_at call to convert factor to character.
# Same example data, with names kept as character rather than factor.
day1 <- data.frame(
  name = c("jose", "mary", "julia"),
  freq = c(1, 5, 3),
  percent = c(40, 30, 20),
  stringsAsFactors = FALSE
)
day2 <- data.frame(
  name = c("abner", "jose", "mary"),
  freq = c(3, 5, 4),
  percent = c(20, 30, 20),
  stringsAsFactors = FALSE
)
day3 <- data.frame(
  name = c("abner", "jose", "mike"),
  freq = c(6, 2, 3),
  percent = c(40, 30, 70),
  stringsAsFactors = FALSE
)
day4 <- data.frame(
  name = c("andre", "joseph", "ana"),
  freq = c(1, 5, 8),
  percent = c(40, 30, 20),
  stringsAsFactors = FALSE
)
day5 <- data.frame(
  name = c("abner", "poli", "joseph"),
  freq = c(4, 3, 3),
  percent = c(10, 30, 10),
  stringsAsFactors = FALSE
)

# Note: day3 is defined above but is not part of this list.
dates <- list(day1, day2, day4, day5)
As described in numerous questions on here, I should be able to take a data.frame, group it, sort by date, and then apply cumsum, to get the cumulative sum over time per grouping.
Instead, with dplyr 0.8.0, I'm getting cumulative sums that ignore the grouping.
Example code:
# 1000 random (category, date) rows; each row counts as 1, so a grouped
# cumsum() over x should give a running count per category.
data.frame(
  cat = sample(c("a", "b", "c"), size = 1000, replace = TRUE),
  date = sample(
    seq(as.Date("1999/01/01"), as.Date("2000/01/01"), by = "day"),
    1000,
    replace = TRUE
  )
) %>%
  mutate(x = 1) %>%
  arrange(date) %>%
  group_by(cat) %>%
  mutate(x = cumsum(x)) %>%
  tail()
Now, I'd expect the last few rows to have x equal to around 300-something, for each group.
Instead I get:
# A tibble: 6 x 3
# Groups: cat [2]
cat date x
<chr> <date> <dbl>
1 a 1999-12-31 995
2 a 1999-12-31 996
3 c 2000-01-01 997
4 a 2000-01-01 998
5 c 2000-01-01 999
6 a 2000-01-01 1000
What am I doing wrong?
I'm guessing this is a classic problem when you load plyr after dplyr, nothing to do with your version of dplyr. For example:
# Same random data as in the question, stored so that both mutate() variants
# can be run on identical input.
tmp1 <- data.frame(
  cat = sample(c("a", "b", "c"), size = 1000, replace = TRUE),
  date = sample(
    seq(as.Date("1999/01/01"), as.Date("2000/01/01"), by = "day"),
    1000,
    replace = TRUE
  )
) %>%
  mutate(x = 1)
see difference between
# Variant 1: the mutate() from plyr, called explicitly by namespace.
tmp1 %>%
  arrange(date) %>%
  group_by(cat) %>%
  plyr::mutate(x = cumsum(x)) %>%
  tail(n = 6L)
and
# Variant 2: the mutate() from dplyr, called explicitly by namespace.
tmp1 %>%
  arrange(date) %>%
  group_by(cat) %>%
  dplyr::mutate(x = cumsum(x)) %>%
  tail(n = 6L)
plyr's mutate doesn't understand grouping.
You can verify if this is the problem using search()