Note that the values of the time column in my output table are rounded to whole numbers, but I would like to keep two decimal places after the comma. How can I adjust this in the code below?
library(dplyr)
Test <- structure(list(date1 = as.Date(c("2021-11-01","2021-11-01","2021-11-01","2021-11-01")),
date2 = as.Date(c("2021-10-22","2021-10-22","2021-10-28","2021-10-30")),
Week = c("Friday", "Friday", "Thursday", "thursday"),
Category = c("FDE", "FDE", "FDE", "FDE"),
time = c(4, 6, 6, 3)), class = "data.frame",row.names = c(NA, -4L))
Test <- Test %>%
  group_by(Week = tools::toTitleCase(Week), Category) %>%
  summarise(time = mean(time, na.rm = TRUE), .groups = 'drop')
Test <- transform(Test, time = round(time))
> Test
Week Category time
1 Friday FDE 5
2 Thursday FDE 4
An alternative to Onyambu's suggestion is:
Test <- transform(Test, time = format(round(time, digits = 2), nsmall = 2))
The nsmall argument of format sets the minimum number of digits to the right of the decimal.
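A quick illustration of the difference (my addition): round keeps the result numeric, so trailing zeros are dropped when printed, while format pads to nsmall digits but returns a character vector.
round(5.001, digits = 2)                      # 5      (numeric; trailing zeros dropped in printing)
format(round(5.001, digits = 2), nsmall = 2)  # "5.00" (character, padded to two decimals)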
I would like to solve some problems with column names that lead to errors when running code. Here is a simple example. Note that my data has a column called TimeofCalculate, while the code below uses Timeofcalculate, which gives an error because it says calculate instead of Calculate. However, I would like either spelling to work in the code. Also, I have a database whose column is Timeofcalculâte; this â is common where I live. Therefore, I would like to resolve all of these issues.
library(dplyr)
Test <- structure(list(date1 = as.Date(c("2021-11-01","2021-11-01","2021-11-01","2021-11-01")),
date2 = as.Date(c("2021-10-22","2021-10-22","2021-10-28","2021-10-30")),
Week = c("Friday", "Friday", "Thursday", "thursday"),
Category = c("FDE", "FDE", "FDE", "FDE"),
TimeofCalculate = c(4, 6, 6, 3)), class = "data.frame",row.names = c(NA, -4L))
Test %>%
  group_by(Week = tools::toTitleCase(Week)) %>%
  summarise(Time = mean(Timeofcalculate), .groups = 'drop')
I think weekdays with inconsistent spellings are unacceptable in a database, so fix this first. We may use the built-in tools::toTitleCase to make the first letters upper-case:
Test <- transform(Test, Week=tools::toTitleCase(Week))
Then we may easily aggregate by column numbers, so no names are needed:
aggregate(list(Time=Test[, 5]), list(Week=Test[, 3]), mean)
# Week Time
# 1 Friday 5.0
# 2 Thursday 4.5
If it's a problem to hard-code column indices by hand, we may use agrep, which finds the index of the most similar column name via approximate (string-distance) matching:
c_tcalc <- agrep('timeofcalculate', names(Test))
c_week <- agrep('week', names(Test))
aggregate(list(Time=Test[, c_tcalc]), list(Week=Test[, c_week]), mean)
# Week Time
# 1 Friday 5.0
# 2 Thursday 4.5
Data:
Test <- structure(list(date1 = structure(c(18932, 18932, 18932, 18932
), class = "Date"), date2 = structure(c(18922, 18922, 18928,
18930), class = "Date"), Week = c("Friday", "Friday", "Thursday",
"Thursday"), Category = c("FDE", "FDE", "FDE", "FDE"), TimeofCalculate = c(4,
6, 6, 3)), class = "data.frame", row.names = c(NA, -4L))
Perhaps we can take advantage of tidyselect::matches.
library(dplyr)
nms <- c('TimeofCalculate|Timeofcalculate|Timeofcalculâte')
#alternative one
Test %>%
  group_by(Week = tools::toTitleCase(Week)) %>%
  summarise(across(matches(nms), mean), .groups = 'drop')
#> # A tibble: 2 × 2
#> Week TimeofCalculate
#> <chr> <dbl>
#> 1 Friday 5
#> 2 Thursday 4.5
# using a purrr-style lambda
Test %>%
  group_by(Week = tools::toTitleCase(Week)) %>%
  summarise(across(matches(nms), ~mean(., na.rm = TRUE)), .groups = 'drop')
#> # A tibble: 2 × 2
#> Week TimeofCalculate
#> <chr> <dbl>
#> 1 Friday 5
#> 2 Thursday 4.5
# this will also work
Test %>%
  group_by(Week = tools::toTitleCase(Week)) %>%
  summarise(across(any_of(c("Timeofcalculate", "TimeofCalculate", "Timeofcalculâte")), ~ mean(., na.rm = TRUE)), .groups = "drop")
Created on 2021-12-26 by the reprex package (v2.0.1)
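Another option, offered as a sketch rather than as part of the answers above, is to normalize the column names once up front, so all later code can rely on a single canonical spelling. Note that iconv transliteration of accented characters such as "â" is platform-dependent, so treat this as an assumption to verify:
library(dplyr)
# strip accents (e.g. "â" -> "a") and lower-case all names;
# "ASCII//TRANSLIT" behaviour depends on the platform's iconv
names(Test) <- tolower(iconv(names(Test), to = "ASCII//TRANSLIT"))
Test %>%
  group_by(week = tools::toTitleCase(week)) %>%
  summarise(time = mean(timeofcalculate, na.rm = TRUE), .groups = 'drop')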
Hi, I have time series data with daily dates (variable 1), and for each date a time variable that runs from 1 to 60. On each day there are X events. Is there a way to create a new dataset where my value is summed across 2-day aggregates, so that I have 30 rows (time values) instead of 60?
Update: Here is a reproducible example of what I want
set.seed(101)
df <- data.frame(
  dte = c(as.Date("2021-01-01"),
          as.Date("2021-01-02"),
          as.Date("2021-01-03"),
          as.Date("2021-01-04"),
          as.Date("2021-02-01"),
          as.Date("2021-02-02"),
          as.Date("2021-02-03"),
          as.Date("2021-02-04")),
  tme = rep(c(1, 2, 3, 4), 2),
  val1 = sample(1:8),
  work_type = c("Construction Worker", "Construction Worker",
                "Construction Worker", "Construction Worker",
                "Sales", "Sales", "Sales", "Sales"),
  Work_Site = "A"
)
print(df)
df_2day <- data.frame(
  tme = rep(c(1, 2), 2),
  val1 = c(9, 13, 5, 9),
  work_type = c("Construction Worker", "Construction Worker",
                "Sales", "Sales"),
  Work_Site = "A"
)
print(df_2day)
I also have facilities B, C, and D.
You can create groups of 2 days and sum the val1 values:
library(lubridate)
library(dplyr)
df %>%
  group_by(Work_Site, work_type, grp = ceiling_date(dte, '2 days')) %>%
  summarise(val1 = sum(val1))
# Work_Site work_type grp val1
# <chr> <chr> <date> <int>
#1 A Construction Worker 2021-01-03 9
#2 A Construction Worker 2021-01-05 15
#3 A Sales 2021-02-03 5
#4 A Sales 2021-02-05 7
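Note that ceiling_date labels each 2-day group by its upper boundary, as in the output above. If you prefer groups labelled by their first day, floor_date can be swapped in the same way; this is a sketch, and since lubridate anchors multi-day rounding within the month, verify the boundaries on your real data:
df %>%
  group_by(Work_Site, work_type, grp = floor_date(dte, '2 days')) %>%
  summarise(val1 = sum(val1))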
You can identify the groupings by dividing the row number within each day by two and rounding up to the nearest whole number. So the 3rd reading would be 3/2 = 1.5, rounded up to group 2; the 10th would be 10/2 = group 5.
Below is an implementation using dplyr, but you could use something else...
library(dplyr)
df <- data.frame(
  dte = c(as.Date("2021-01-01"),
          as.Date("2021-01-01"),
          as.Date("2021-01-01"),
          as.Date("2021-01-01"),
          as.Date("2021-02-01"),
          as.Date("2021-02-01"),
          as.Date("2021-02-01"),
          as.Date("2021-02-01")),
  tme = rep(c(1, 2, 3, 4), 2),
  val1 = sample(1:8),
  val2 = sample(1:8)
)
print(df)
result <- df %>%
  group_by(dte) %>%
  mutate(dategroup = ceiling(rank(tme) / 2)) %>%
  group_by(dte, dategroup) %>%
  summarise_all(sum)
print(result)
I have the following example dataset:
df <- data.frame("id" = c(1,2,3,3,4),
"start" = c(01-01-2018,01-06-2018,01-05-2018,01-05-2018,01-05-2018, 01-10-2018),
"end" = c(01-03-2018,01-07-2018,01-09-2018,01-06-2018,01-06-2018,01-11-2018))
df$start <- as.Date(df$start, "%d-%m-%Y")
df$end <- as.Date(df$end, "%d-%m-%Y")
What I want to do with it is get, for each group, a union of all date intervals, i.e.
01-01-2018 - 01-03-2018 for group 1
01-06-2018 - 01-06-2018 for group 2
01-05-2018 - 01-09-2018 for group 3
01-05-2018 - 01-06-2018 and 01-10-2018 - 01-11-2018 for group 4
The purpose of this is to have an interval as an output, because I need it to determine whether certain observation dates for the group fall in the intervals or not.
We convert 'start' and 'end' to Date class, then group by 'id' and create an interval column in summarise based on the min of 'start' and the max of 'end':
library(dplyr)
library(lubridate)
df %>%
  mutate(across(c(start, end), dmy)) %>%
  group_by(id) %>%
  summarise(interval = interval(min(start), max(end)), .groups = 'drop')
data
df <- structure(list(id = c(1, 2, 3, 3, 4), start = c("01-01-2018",
"01-06-2018", "01-05-2018", "01-05-2018", "01-10-2018"), end = c("01-03-2018",
"01-07-2018", "01-09-2018", "01-06-2018", "01-11-2018")),
class = "data.frame", row.names = c(NA,
-5L))
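Since the stated purpose was to check whether observation dates fall inside each group's interval, lubridate's %within% operator does that test directly. A minimal sketch against the data block above, where obs_date is a hypothetical observation date:
library(dplyr)
library(lubridate)
obs_date <- dmy("15-06-2018")  # hypothetical observation date
intervals <- df %>%
  mutate(across(c(start, end), dmy)) %>%
  group_by(id) %>%
  summarise(interval = interval(min(start), max(end)), .groups = 'drop')
# TRUE where obs_date falls inside the group's interval
intervals %>% mutate(contains_obs = obs_date %within% interval)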
Reproducible data sample (dput output):
structure(list(id = c(1, 1, 1, 2, 3, 3, 4),
start = structure(c(1546326000,
1546326060, 1546326270, 1546722600, 1546884300, 1546884720,
1547102430), tzone = "UTC", class = c("POSIXct", "POSIXt")),
event_time = structure(c(1546326059, 1546326059, 1546326059,
1546722930, 1546884480, 1546884480, NA),
tzone = "UTC", class = c("POSIXct", "POSIXt"))),
.Names = c("id", "start", "event_time"), row.names = c(NA, -7L),
class = "data.frame")
I have some messy data that was merged from different sources, and I am trying to create a new logical variable that identifies which observation within each group (id) has the smallest positive time difference between the start and event_time variables, ideally within dplyr.
I've tried a few approaches but can't find one that works. My current idea is to create a new variable holding the time difference between event_time and start (or NA if that difference is negative), and then derive the desired variable from it.
The code:
dat %>% mutate(difference = ifelse(event_time > start,
event_time - start,
NA)) %>%
mutate(difference = as.integer(difference)) %>%
group_by(id) %>%
mutate(is_closest = row_number() == which.min(difference))
This gives me an error, though, and the variable is_closest is not created.
What I'm looking for, in its simplest form, is either: code to create a variable identifying the value closest to some reference value (another variable or some provided quantity, including, in this case, time) within a group of observations; or a more sensible way to identify the closest time within a group.
Check this solution:
library(lubridate)
library(dplyr)
dat %>%
mutate(time_diff = start %--% event_time %>% as.numeric()) %>%
group_by(id) %>%
mutate(
min_diff = time_diff[time_diff >= 0] %>% min(),
min_diff_gr = time_diff == min_diff
)
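A variant of the same idea (my sketch, not part of the answer above) uses base difftime and flags the minimum directly; groups where every difference is negative or missing come out FALSE, though min() will warn about them:
library(dplyr)
dat %>%
  mutate(diff = as.numeric(difftime(event_time, start, units = "secs")),
         diff = ifelse(diff < 0, NA, diff)) %>%
  group_by(id) %>%
  # min() returns Inf (with a warning) for all-NA groups, so the comparison is FALSE there
  mutate(is_closest = !is.na(diff) & diff == min(diff, na.rm = TRUE)) %>%
  ungroup()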
I am trying to create a dataset based on the difference in days between a start and an end date. As an example:
Name Start_Date End_Date
Alice 1-1-2017 3-1-2017
John 4-3-2017 5-3-2017
Peter 12-3-2017 12-3-2017
So the final dataset will include the start date, the end date, and also the dates in between. Eventually it should look something like this:
Name Date
Alice 1-1-2017
Alice 2-1-2017
Alice 3-1-2017
John 4-3-2017
John 5-3-2017
Peter 12-3-2017
Any help is greatly appreciated. Thanks!
We can use Map to get the sequences and melt the list into a data.frame:
df1[-1] <- lapply(df1[-1], as.Date, format = "%d-%m-%Y")
lst <- setNames(Map(function(x, y) seq(x, y, by = "1 day"),
                    df1$Start_Date, df1$End_Date), df1$Name)
library(reshape2)
melt(lst)[2:1]
data
df1 <- structure(list(Name = c("Alice", "John", "Peter"), Start_Date = structure(c(17167,
17229, 17237), class = "Date"), End_Date = structure(c(17169,
17230, 17237), class = "Date")), .Names = c("Name", "Start_Date",
"End_Date"), row.names = c(NA, -3L), class = "data.frame")
This uses the expandRows function from the package splitstackshape:
library(dplyr)
library(splitstackshape)
df = df %>%
  mutate(days_between = as.numeric(End_Date - Start_Date) + 1,
         id = row_number(Name)) %>%
  expandRows("days_between") %>%
  group_by(id) %>%
  mutate(Date = seq(first(Start_Date),
                    first(End_Date),
                    by = 1)) %>%
  ungroup()
Using a foreach loop:
library(data.table)
library(foreach)
library(lubridate)
setDT(df)
names = df[, unique(Name)]
l = foreach(i = 1:length(names)) %do% {
  # make a date sequence per name
  # (dmy() assumes Start_Date/End_Date are character strings such as "1-1-2017")
  d = df[Name == names[i], ]
  s = seq(from = dmy(d$Start_Date), to = dmy(d$End_Date), by = "days")
  # bind the results in a data.table; the last expression is returned for this iteration
  dx = data.table(name = rep(names[i], length(s)))
  dx = cbind(dx, date = s)
}
rbindlist(l)
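For completeness, a compact tidyverse sketch of the same expansion (my addition, assuming the df1 from the data block above, where the dates are already of class Date):
library(dplyr)
library(tidyr)
df1 %>%
  rowwise() %>%
  mutate(Date = list(seq(Start_Date, End_Date, by = "1 day"))) %>%
  ungroup() %>%
  select(Name, Date) %>%
  unnest(Date)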