I have a two dataframes - one is the base dataframe and the other the query dataframe.
Base Dataframe (base_df):
Mon Tue Wed Thu Fri Sat
A 5.23 0.01 6.81 8.67 0.10 6.21
B 6.26 2.19 4.28 5.57 0.16 2.81
C 7.41 2.63 4.32 6.57 0.20 1.69
D 6.17 1.50 5.30 9.22 2.19 5.47
E 1.23 9.01 8.09 1.29 7.65 4.57
Query Dataframe (query_df):
Person Start End
A Tue Thu
C Mon Wed
D Thu Sat
C Thu Sat
B Wed Fri
I want to extract all the observations for a particular person between the start and end days. The difference between start and end days is always three (inclusive of start and end days).
Hence the output wanted is:
Person Start End D1 D2 D3
A Tue Thu 0.01 6.81 8.67
C Mon Wed 7.41 2.63 4.32
D Thu Sat 9.22 2.19 5.47
C Thu Sat 6.57 0.20 1.69
B Wed Fri 4.28 5.57 0.16
I want to avoid a loop because the actual base_df is more than 35000 rows. Is there a data.table solution? Solutions using other data structures are good too. Thank you!
Another base R solution, using mapply...
query_df <- cbind(query_df,
t(mapply(function(p,s,e) {
base_df[p, match(s, names(base_df)):match(e, names(base_df))]},
query_df$Person,
query_df$Start,
query_df$End)))
names(query_df)[4:6] <- c("D1", "D2", "D3")
query_df
Person Start End D1 D2 D3
1 A Tue Thu 0.01 6.81 8.67
2 C Mon Wed 7.41 2.63 4.32
3 D Thu Sat 9.22 2.19 5.47
4 C Thu Sat 6.57 0.2 1.69
5 B Wed Fri 4.28 5.57 0.16
The data.table solution below should be working also for varying numbers of days between Start and End days (not just 3 day periods) thanks to a non-equi join and melt() / dcast() for reshaping:
library(data.table)
setDT(base_df)
setDT(query_df)
# reshape from wide to long
long <- melt(base_df, id.vars = "Person", variable.name = "Day")
# align factor levels
cols <- c("Start", "End")
query_df[, (cols) := lapply(.SD, factor, levels = levels(long$Day)), .SDcols = cols][
# add row id because Person is not unique
, rn := .I]
# non-equi join right join, i.e., take all rows of query_df
long[query_df, on = .(Person, Day >= Start, Day <= End),
.(rn, Person, Start = i.Start, End = i.End, value)][
# reshape from long to wide
, dcast(.SD, rn + Person + ... ~ rowid(rn, prefix = "D"))]
rn Person Start End D1 D2 D3
1: 1 A Tue Thu 0.01 6.81 8.67
2: 2 C Mon Wed 7.41 2.63 4.32
3: 3 D Thu Sat 9.22 2.19 5.47
4: 4 C Thu Sat 6.57 0.20 1.69
5: 5 B Wed Fri 4.28 5.57 0.16
Note that Day is a factor with the names of weekdays as factor levels in order of appearance:
str(long)
Classes ‘data.table’ and 'data.frame': 30 obs. of 3 variables:
$ Person: chr "A" "B" "C" "D" ...
$ Day : Factor w/ 6 levels "Mon","Tue","Wed",..: 1 1 1 1 1 2 2 2 2 2 ...
$ value : num 5.23 6.26 7.41 6.17 1.23 0.01 2.19 2.63 1.5 9.01 ...
- attr(*, ".internal.selfref")=<externalptr>
Aligned factor levels are crucial for the non-equi join.
Data
library(data.table)
base_df <- fread(
"Person Mon Tue Wed Thu Fri Sat
A 5.23 0.01 6.81 8.67 0.10 6.21
B 6.26 2.19 4.28 5.57 0.16 2.81
C 7.41 2.63 4.32 6.57 0.20 1.69
D 6.17 1.50 5.30 9.22 2.19 5.47
E 1.23 9.01 8.09 1.29 7.65 4.57"
)
query_df <- fread(
"Person Start End
A Tue Thu
C Mon Wed
D Thu Sat
C Thu Sat
B Wed Fri"
)
A tidyverse answer
I reshape base_df, then join and slice the correct days, then reshape back.
library(tidyr)
library(dplyr)
base_df <- tibble::rownames_to_column(base_df, 'Person')
days <- names(base_df)[-1]
base_df %>%
gather(day, value, -Person) %>%
right_join(mutate(query_df, i = row_number())) %>%
group_by(i) %>%
slice(which(days == Start):which(days == End)) %>%
mutate(col = c('D1', 'D2', 'D3')) %>%
select(-day, -i) %>%
spread(col, value)
data.table solution:
Here I use get to extract columns (e.g. Mon) from a data.table object.
library(data.table)
# Prepare data
base_df$Person <- rownames(base_df)
d <- merge(query_df, base_df, "Person", sort = FALSE)
setDT(d)
# Extract mid day (day between start and end)
d[, Mid := days[which(Start == days) + 1], 1:nrow(d)]
# Extract columns using get
d[, .(Person, Start, End,
D1 = get(Start), D2 = get(Mid), D3 = get(End)), 1:nrow(d)][, nrow := NULL][]
Person Start End D1 D2 D3
1: A Tue Thu 0.01 6.81 8.67
2: C Mon Wed 7.41 2.63 4.32
3: D Thu Sat 9.22 2.19 5.47
4: C Thu Sat 6.57 0.20 1.69
5: B Wed Fri 4.28 5.57 0.16
Base R solution:
# Order of days
days <- names(base_df)
# Order of persons
subjects <- rownames(base_df)
res <- apply(query_df, 1, function(x) {
# Extract observation between start:end date
foo <- base_df[x[1] == subjects, which(x[2] == days):which(x[3] == days)]
colnames(foo) <- paste0("D", 1:3)
foo})
# Merge with original query_df
res <- cbind(query_df, do.call("rbind", res))
rownames(res) <- NULL
res
A base solution using indexing with a numeric matrix:
ri <- match(query_df$Person, rownames(base_df))
ci <- match(query_df$Start, names(base_df))
cbind(query_df, `dim<-`(base_df[cbind(ri, rep(ci, 3) + rep(0:2, each = nrow(query_df)))],
c(nrow(query_df), 3)))
# Person Start End 1 2 3
# 1 A Tue Thu 0.01 6.81 8.67
# 2 C Mon Wed 7.41 2.63 4.32
# 3 D Thu Sat 9.22 2.19 5.47
# 4 C Thu Sat 6.57 0.20 1.69
# 5 B Wed Fri 4.28 5.57 0.16
Related
I have a data set that will be used for time series. the date column is currently structured as follows:
> head(cam_shiller)
div stock dates
1 0.495 7.09 1933m1
2 0.490 6.25 1933m2
3 0.485 6.23 1933m3
4 0.480 6.89 1933m4
5 0.475 8.87 1933m5
6 0.470 10.39 1933m6
If I'm not mistaken, monthly data for time series should look like this: yyyy-mm. So I'm trying to make my date column look like this:
div stock dates
1 0.495 7.09 1933-01
2 0.490 6.25 1933-02
3 0.485 6.23 1933-03
4 0.480 6.89 1933-04
5 0.475 8.87 1933-05
6 0.470 10.39 1933-06
However, using the as.yearmo function produces a column full of NAs. I tried removing the 'm' and replacing it with a dash, and then running as.yearmo again. Now the results look like this:
div stock dates
1 0.495 7.09 Jan 1933
2 0.490 6.25 Feb 1933
3 0.485 6.23 Mar 1933
4 0.480 6.89 Apr 1933
5 0.475 8.87 May 1933
6 0.470 10.39 Jun 1933
How do I change the dates into the yyyy-mm format?
library(zoo)
cam_shiller = read.csv('https://raw.githubusercontent.com/bandcar/Examples/main/cam_shiller.csv')
cam_shiller$dates = gsub('m', '-', cam_shiller$dates)
cam_shiller$dates = as.yearmon(cam_shiller$dates)
Actually, in ts you just need to specify start= and frequency.
res <- ts(cam_shiller[, -3], start=1933, frequency=12)
res
# div stock
# Jan 1933 0.4950 7.09
# Feb 1933 0.4900 6.25
# Mar 1933 0.4850 6.23
# Apr 1933 0.4800 6.89
# May 1933 0.4750 8.87
# Jun 1933 0.4700 10.39
# Jul 1933 0.4650 11.23
# Aug 1933 0.4600 10.67
# Sep 1933 0.4550 10.58
# Oct 1933 0.4500 9.55
# Nov 1933 0.4450 9.78
# Dec 1933 0.4400 9.97
# Jan 1934 0.4408 10.54
# Feb 1934 0.4417 11.32
# Mar 1934 0.4425 10.74
# Apr 1934 0.4433 10.92
# May 1934 0.4442 9.81
# Jun 1934 0.4450 9.94
# Jul 1934 0.4458 9.47
# Aug 1934 0.4467 9.10
# Sep 1934 0.4475 8.88
# Oct 1934 0.4483 8.95
# Nov 1934 0.4492 9.20
# Dec 1934 0.4500 9.26
# ...
Or
ts(cam_shiller$stock, start=c(1933, 1), frequency=12)
# Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
# 1933 7.09 6.25 6.23 6.89 8.87 10.39 11.23 10.67 10.58 9.55 9.78 9.97
# 1934 10.54 11.32 10.74 10.92 9.81 9.94 9.47 9.10 8.88 8.95 9.20 9.26
# 1935 9.26 8.98 8.41 9.04 9.75 10.12 10.65 11.37 11.61 11.92 13.04 13.04
# ...
It may be wise to check beforehand that there are no gaps in the data by evaluating the column and row variances of years and month matrices:
test <- do.call(rbind, strsplit(cam_shiller$dates, 'm')) |>
type.convert(as.is=TRUE)
matrixStats::colVars(matrix(test[, 1], 12))
# [1] 0 0 ...
matrixStats::rowVars(matrix(test[, 2], 12))
# [1] 0 0 0 0 0 0 0 0 0 0 0 0
If you use the xts::xts, it's rather picky since it wants a time-based class such as "Date" or "POSIXct". So you need whole dates, i.e. paste a 01 as pseudo date.
res <- transform(cam_shiller, dates=strptime(paste(dates, '01'), format='%Ym%m %d')) |>
{\(.) xts::as.xts(.[1:2], .$dates)}()
head(res)
# div stock
# 1933-01-01 0.495 7.09
# 1933-02-01 0.490 6.25
# 1933-03-01 0.485 6.23
# 1933-04-01 0.480 6.89
# 1933-05-01 0.475 8.87
# 1933-06-01 0.470 10.39
class(res)
# [1] "xts" "zoo"
Data:
cam_shiller <- structure(list(div = c(0.495, 0.49, 0.485, 0.48, 0.475, 0.47,
0.465, 0.46, 0.455, 0.45, 0.445, 0.44, 0.4408, 0.4417, 0.4425,
0.4433, 0.4442, 0.445, 0.4458, 0.4467, 0.4475, 0.4483, 0.4492,
0.45), stock = c(7.09, 6.25, 6.23, 6.89, 8.87, 10.39, 11.23,
10.67, 10.58, 9.55, 9.78, 9.97, 10.54, 11.32, 10.74, 10.92, 9.81,
9.94, 9.47, 9.1, 8.88, 8.95, 9.2, 9.26), dates = c("1933m1",
"1933m2", "1933m3", "1933m4", "1933m5", "1933m6", "1933m7", "1933m8",
"1933m9", "1933m10", "1933m11", "1933m12", "1934m1", "1934m2",
"1934m3", "1934m4", "1934m5", "1934m6", "1934m7", "1934m8", "1934m9",
"1934m10", "1934m11", "1934m12")), row.names = c(NA, 24L), class = "data.frame")
Try lubridate::ym to change dates to yyyy-mm format
library(tidyverse)
cam_shiller = read.csv('https://raw.githubusercontent.com/bandcar/Examples/main/cam_shiller.csv')
cam_shiller %>%
mutate(
date = lubridate::ym(dates),
date = strftime(date, "%Y-%m")
) %>%
head()
#> div stock dates date
#> 1 0.495 7.09 1933m1 1933-01
#> 2 0.490 6.25 1933m2 1933-02
#> 3 0.485 6.23 1933m3 1933-03
#> 4 0.480 6.89 1933m4 1933-04
#> 5 0.475 8.87 1933m5 1933-05
#> 6 0.470 10.39 1933m6 1933-06
Created on 2022-10-01 with reprex v2.0.2
The form in the question is already correct. It is not true
that you need to change it. It renders as Jan 1933, etc. but internally it is represented as year+(month-1)/12 (where month is a number 1, 2, ..., 12) which is exactly what you need for analysis. You do not want a character string of the form yyyy-mm for analysis.
If by "time series" you mean a zoo series then using u defined in the Note at the end, z below gives that with a yearmon index. The index argument to read.csv.zoo gives the column number or name of the index, the FUN argument tells it how to convert it and the format argument tells it the precise form of the dates.
If what you mean by time series is that you want a ts series then tt below gives that.
If what you mean is a data frame with a yearmon column then DF below gives that.
With either a zoo series or a ts series one could perform a variety of analyses. For example, acf(z) or acf(tt) would give the autocorrelation function.
For more information see ?read.csv.zoo . There is also an entire vignette on read.zoo and its variants. The vignettes are linked to on the CRAN home page for zoo. Also see ?strptime for the percent codes.
library(zoo)
# zoo series with yearmon column
z <- read.csv.zoo(u, index = 3, FUN = as.yearmon, format = "%Ym%m")
# ts series
tt <- as.ts(z)
# data frame with yearmon column
DF <- u |>
read.csv() |>
transform(dates = as.yearmon(dates, "%Ym%m"))
A character string of the form yyyy-mm is not a suitable form for most analyses but if you really did want that anyways then
# zoo series with yyyy-mm character string index
z2 <- aggregate(z, format(index(z), "%Y-%m"), c)
# data.frame with yyyy-mm character string column
DF2 <- transform(DF, dates = format(dates, "%Y-%m"))
Note
u <- "https://raw.githubusercontent.com/bandcar/Examples/main/cam_shiller.csv"
This question already has answers here:
How to reshape data from long to wide format
(14 answers)
Closed 2 years ago.
I get always a data table of this format:
set.seed(123)
dt <- data.table(date = seq(as.Date('2020-01-01'), by = '1 day', length.out = 365),
Germany = rnorm(365, 2, 1), check.names = FALSE)
The data table dt is only a counterexample. Every day of a month has a value (price), but it is also possible that a day has an NA-value.
... can be understood as any value.
What I want to do is, to change the format of this data table into the following form (colors can be ignored):
The first column defines the months of the year and all other columns are declaring the dates, e.g. price on the 2nd Jan is 2.11 or on the 31th Sept is 1.78. How can I do this?
Extract month and date in separate column and get the data in wide format.
library(dplyr)
dt %>%
mutate(month = format(date, '%b'),
date = format(date, '%d')) %>%
tidyr::pivot_wider(names_from = date, values_from = Germany)
If you want to do this in data.table you can use dcast.
library(data.table)
dcast(dt[,`:=`(month = format(date, '%b'), date = format(date, '%d'))],
month~date, value.var = 'Germany')
Here is another way to solve it using data.table.
Note that month.abb is a built-in variable in r.
Using factor function and month.abb as levels allows to sort the months properly.
library(data.table)
dcast(dt[, month := factor(months(date, abbr = TRUE), month.abb)],
month ~ mday(date), value.var = 'Germany')
# month 1 2 3 4 5 6 7 8 9 10 11 12 ...
# 1: Jan 1.44 1.77 3.56 2.07 2.13 3.72 2.46 0.73 1.31 1.55 3.22 2.36
# 2: Feb 1.70 2.90 2.88 2.82 2.69 2.55 1.94 1.69 1.62 1.31 1.79 0.73
# 3: Mar 2.38 1.50 1.67 0.98 0.93 2.30 2.45 2.05 2.92 4.05 1.51 -0.31
# ...
I am struggling with this issue that I am unable to fix! I have to data-frames ("Master" and " "Hours"). The 'Master' df has many columns but the ones in particular are as follows below:
Master
StoreNumber ... MON TUE WED THU FRI SAT SUN
1 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0
3 0 0 0 0 0 0 0
...
NB: The Master df has many columns in between StoreNumber as the days of the week and holds a lot of data (about 3000 stores)
Hours
BranchNumber Day TimeDiff
1 MON 7.50
1 TUE 6.00
1 WED 8.50
1 THU 2.00
1 FRI 1.00
1 SAT 2.50
3 MON 7.50
3 TUE 6.00
3 WED 8.50
3 THU 2.00
3 FRI 1.00
3 SAT 2.50
3 SUN 5.00
...
So the idea is that I am trying to match the 'Hours' BrandNumber with the 'Master' StoreNumber. Once there is a match then it matched the Day column from the 'Hours' table with the Days of the week in the 'Masters' Table...It will do this for each row and then populate days of the week with the corresponding value in 'TimeDiff' column...if the store and branch number has no match (Like StoreNumber 2) then it should skip that row and move onto the next. Another condition, like BranchNumber '1' there is no data for SUNDAY so in the 'Master' table the SUNDAY cell should be left as 0...this should work for any day of the week.
The output should be the 'Master' Table but complete with all the days of the week data from the 'Hours' Table. In this example, it should look like:
StoreNumber ... MON TUE WED THU FRI SAT SUN
1 7.50 6.00 8.50 2.00 1.00 2.50 0
2 0 0 0 0 0 0 0
3 7.50 6.00 8.50 2.00 1.00 2.50 5.00
...
The code I have tried is semi-working but I am not sure if its the correct approach. The biggest problem I am getting is that its duplicating the rows expect from the first row. For instance, the output looks more like this.
StoreNumber
1
2
2
3
3
4
4
5
5
5
all are duplicating and some tripling and every 87 columns are identical...however the days of the week of the duplicated row are all 0's.
merged <- Master %>% select(-c("MON","TUE","WED","THU","FRI","SAT","SUN")) %>%
left_join(
Hours %>% pivot_wider(names_from = Day, values_from = TimeDiff),
by = c('StoreNumber' = 'BranchNumber'))
merged <- merged %>% replace(is.na(.),0)
Sorry for the long question, this issue has been bugging me a while so any help/advice will be grateful
If I understand correctly, the Master table has many columns and only the columns MON to SUN need to be updated.
Here are two approaches which uses data.table's ability to update in a join. Only the relevant columns are modified by reference, i.e., without copying the whole data object. It avoids to reshape (or pivot) the Master table forth and back.
Variant 1
library(data.table)
days <- names(Master)[which(names(Master) == "MON") + (0:6)]
setDT(Master)[, (days) := lapply(.SD, as.double), .SDcols = days]
for (d in days) {
Master[Hours, on =.(StoreNumber = BranchNumber), (d) := TimeDiff[d == Day], by = .EACHI]
}
Master[]
StoreNumber OtherCol MON TUE WED THU FRI SAT SUN
1: 1 a 7.5 6 8.5 2 1 2.5 0
2: 2 b 0.0 0 0.0 0 0 0.0 0
3: 3 c 7.5 6 8.5 2 1 2.5 5
Explanation
days contains the names of the columns.
days <- names(Master)[which(names(Master) == "MON") + (0:6)] is equivalent to
days <- c("MON", "TUE", "WED", "THU", "FRI", "SAT", "SUN")
data.table requires consistent data types when parts of a column are updated. The day columns in Master are initialized to integer zero but TimeDiff in Hours is numeric. Therefore, the day columns in Master are coerced to double before updating.
The for loop iterates over each day column and performs the update join for this column. For each match (by = .EACHI), the Timediff for the relevant day is picked.
In order to verify that Master has not been copied we can call
data.table::address(Master)
before and after the operation: The address of Master has not changed.
Variant 2
This approach is a bit leaner. It also uses an update join but it is different to variant 1 as it reshapes (or pivots) Hours from long to wide format and removes the days columns from Master instead of coercing a bunch of integer zeroes to type numeric:
library(data.table)
days <- c("MON", "TUE", "WED", "THU", "FRI", "SAT", "SUN")
Hours_wide <- dcast(setDT(Hours)[, Day := ordered(Day, levels = days)], BranchNumber ~ Day)
setDT(Master)[, (days) := NULL][
Hours_wide, on = .(StoreNumber = BranchNumber), (days) := mget(paste0("i.", days))]
Master[]
StoreNumber OtherCol MON TUE WED THU FRI SAT SUN
1: 1 a 7.5 6 8.5 2 1 2.5 NA
2: 2 b NA NA NA NA NA NA NA
3: 3 c 7.5 6 8.5 2 1 2.5 5
Note that missing elements are now initialized to / indicated by NA which is much easier to detect, IMHO. If required, the NAs can be turned into another numeric value by
Master[, (days) := lapply(.SD, nafill, fill = 0), .SDcols = days][]
StoreNumber OtherCol MON TUE WED THU FRI SAT SUN
1: 1 a 7.5 6 8.5 2 1 2.5 0
2: 2 b 0.0 0 0.0 0 0 0.0 0
3: 3 c 7.5 6 8.5 2 1 2.5 5
This approach uses mget(paste0("i.", days)) to pick the days columns from Hours. If there are columns with the same name in both data.tables in a join, we can distinguish the columns by prepending the column names by a x. and i., resp. Thus, x.MON refers to the MON column from the first data.table which is Master in this case and i.MON refers to the MON column from the second data.table which is Hours_wide. mget() takes the column names as character strings and returns a list of the values of the respective columns.
Variant 2 - Edit 1
Above code can be simplified by
setDT(Master)[, (days) := NULL][
Hours_wide, on = .(StoreNumber = BranchNumber), (days) := mget(days)][]
StoreNumber OtherCol MON TUE WED THU FRI SAT SUN
1: 1 a 7.5 6 8.5 2 1 2.5 NA
2: 2 b NA NA NA NA NA NA NA
3: 3 c 7.5 6 8.5 2 1 2.5 5
Because setDT(Master)[, (days) := NULL] already has removed the columns MON to SUN from Master there is no ambiguity on column names. Thus, the column names MON to SUN can be used without prepending them by i. as the only columns named MON to SUN are in Hours_wide.
Variant 2 - Edit 2
With development version 1.14.1 as of 2021-05-10, a new interface for programming on data.table has been added (see item 10 in NEWS and the new vignette programming on data.table). Instead of get()/ mget() the new env argument is recommended:
library(data.table) # development version 1.14.1 used
days <- c("MON", "TUE", "WED", "THU", "FRI", "SAT", "SUN")
Hours_wide <- dcast(setDT(Hours)[, Day := ordered(Day, levels = days)], BranchNumber ~ Day)
setDT(Master)[, (days) := NULL][
Hours_wide, on = .(StoreNumber = BranchNumber), (days) := s,
env = list(s = as.list(days))][]
StoreNumber OtherCol MON TUE WED THU FRI SAT SUN
1: 1 a 7.5 6 8.5 2 1 2.5 NA
2: 2 b NA NA NA NA NA NA NA
3: 3 c 7.5 6 8.5 2 1 2.5 5
Variant 3: env parameter and fcoalesce()
OP's expected result shows 0 instead of NA. With the Variants 2 above, this was was achieved by a separate update step using nafill().
This separate update step can be avoided by using the fcoalesce() function in the update join:
library(data.table) # development version 1.14.1 used
days <- c("MON", "TUE", "WED", "THU", "FRI", "SAT", "SUN")
Hours_wide <- dcast(setDT(Hours)[, Day := ordered(Day, levels = days)], BranchNumber ~ Day)
setDT(Master)[, (days) := lapply(.SD, as.double), .SDcols = days][
Hours_wide, on = .(StoreNumber = BranchNumber), (days) := lapply(s, fcoalesce, 0),
env = list(s = as.list(paste0("i.", days)))][]
StoreNumber OtherCol MON TUE WED THU FRI SAT SUN
1: 1 a 7.5 6 8.5 2 1 2.5 0
2: 2 b 0.0 0 0.0 0 0 0.0 0
3: 3 c 7.5 6 8.5 2 1 2.5 5
Data
library(data.table)
Master <- fread("
StoreNumber OtherCol MON TUE WED THU FRI SAT SUN
1 a 0 0 0 0 0 0 0
2 b 0 0 0 0 0 0 0
3 c 0 0 0 0 0 0 0
", data.table = FALSE)
Hours <- fread("
BranchNumber Day TimeDiff
1 MON 7.50
1 TUE 6.00
1 WED 8.50
1 THU 2.00
1 FRI 1.00
1 SAT 2.50
3 MON 7.50
3 TUE 6.00
3 WED 8.50
3 THU 2.00
3 FRI 1.00
3 SAT 2.50
3 SUN 5.00
", data.table = FALSE)
Based on #GregorThomas comments, here is a longer then wider approach:
master <- data.frame(
StoreNumber = 1:3,
MON = 0,
TUE = 0,
WED = 0,
THU = 0,
FRI = 0,
SAT = 0,
SUN = 0
)
hours <- read.table(text = "BranchNumber Day TimeDiff
1 MON 7.50
1 TUE 6.00
1 WED 8.50
1 THU 2.00
1 FRI 1.00
1 SAT 2.50
3 MON 7.50
3 TUE 6.00
3 WED 8.50
3 THU 2.00
3 FRI 1.00
3 SAT 2.50
3 SUN 5.00", header = TRUE)
library(dplyr)
library(tidyr)
master %>%
pivot_longer(
cols = MON:SUN,
names_to = "Day",
values_to = "Time"
) %>%
left_join(hours, by = c("StoreNumber" = "BranchNumber", "Day")) %>%
mutate(TimeDiff = replace_na(TimeDiff, 0),
Time = TimeDiff) %>%
select(-TimeDiff) %>%
pivot_wider(
id_cols = StoreNumber,
names_from = Day,
values_from = Time
)
# A tibble: 3 x 8
StoreNumber MON TUE WED THU FRI SAT SUN
<int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 7.5 6 8.5 2 1 2.5 0
2 2 0 0 0 0 0 0 0
3 3 7.5 6 8.5 2 1 2.5 5
Edit
Here is a version where master has additional columns and the output is stored:
master <- data.frame(
StoreNumber = 1:3,
other_colum = c("A", "B", "C"),
MON = 0,
TUE = 0,
WED = 0,
THU = 0,
FRI = 0,
SAT = 0,
SUN = 0
)
hours <- read.table(text = "BranchNumber Day TimeDiff
1 MON 7.50
1 TUE 6.00
1 WED 8.50
1 THU 2.00
1 FRI 1.00
1 SAT 2.50
3 MON 7.50
3 TUE 6.00
3 WED 8.50
3 THU 2.00
3 FRI 1.00
3 SAT 2.50
3 SUN 5.00", header = TRUE)
library(dplyr)
library(tidyr)
master <- master %>%
pivot_longer(
cols = MON:SUN,
names_to = "Day",
values_to = "Time"
) %>%
left_join(hours, by = c("StoreNumber" = "BranchNumber", "Day")) %>%
mutate(TimeDiff = replace_na(TimeDiff, 0),
Time = TimeDiff) %>%
select(-TimeDiff) %>%
pivot_wider(
names_from = Day,
values_from = Time
)
master
# A tibble: 3 x 9
StoreNumber other_colum MON TUE WED THU FRI SAT SUN
<int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 A 7.5 6 8.5 2 1 2.5 0
2 2 B 0 0 0 0 0 0 0
3 3 C 7.5 6 8.5 2 1 2.5 5
I have a dataframe that is essentially a time series data.
Timestamp <- c("1/27/2015 18:28:16","1/27/2015 18:28:17","1/27/2015 18:28:19","1/27/2015 18:28:20","1/27/2015 18:28:23","1/28/2015 22:43:08","1/28/2015 22:43:09","1/28/2015 22:43:13","1/28/2015 22:43:15","1/28/2015 22:43:16"
)
ID <- c("A","A","A","A","A","B","B","B","B","B")
v1<- c(1.70,1.71,1.77,1.79,1.63,7.20,7.26,7.16,7.18,7.18)
df <- data.frame(Timestamp ,ID,v1)
Timestamp ID v1
1/27/2015 18:28:16 A 1.70
1/27/2015 18:28:17 A 1.71
1/27/2015 18:28:19 A 1.77
1/27/2015 18:28:20 A 1.79
1/27/2015 18:28:23 A 1.63
1/28/2015 22:43:08 B 7.20
1/28/2015 22:43:09 B 7.26
1/28/2015 22:43:13 B 7.16
1/28/2015 22:43:15 B 7.18
1/28/2015 22:43:16 B 7.18
Since I dont really care about the timestamp, I was thinking of creating a column called interval to plot this data in one plot.
I am wrongly creating the interval column by doing this
df$interval <- cut(df$Timestamp, breaks="sec")
I want to incrementally add the "secs" of the timestamp and put it in the interval column and this should by grouped by ID. By this I mean, Everytime it has a new ID, the interval column resets to 1 and then incrementally adds the timestamp (secs).
My desired output
Timestamp ID v1 Interval
1/27/2015 18:28:16 A 1.70 1
1/27/2015 18:28:17 A 1.71 2
1/27/2015 18:28:19 A 1.77 4
1/27/2015 18:28:20 A 1.79 5
1/27/2015 18:28:23 A 1.63 8
1/28/2015 22:43:08 B 7.20 1
1/28/2015 22:43:09 B 7.26 2
1/28/2015 22:43:13 B 7.16 6
1/28/2015 22:43:15 B 7.18 8
1/28/2015 22:43:16 B 7.18 9
I also would like to plot this using ggplot with interval vs v1 by ID and so we get 2 time series in the same plot. I will then extract features from it.
Please help me how to work around this problem so that I can apply it to a larger dataset.
One solution with data.table:
For the data:
library(data.table)
df <- as.data.table(df)
df$Timestamp <- as.POSIXct(df$Timestamp, format='%m/%d/%Y %H:%M:%S')
df[, Interval := as.numeric(difftime(Timestamp, .SD[1, Timestamp], units='secs') + 1) , by=ID]
which outputs:
> df
Timestamp ID v1 Interval
1: 2015-01-27 18:28:16 A 1.70 1
2: 2015-01-27 18:28:17 A 1.71 2
3: 2015-01-27 18:28:19 A 1.77 4
4: 2015-01-27 18:28:20 A 1.79 5
5: 2015-01-27 18:28:23 A 1.63 8
6: 2015-01-28 22:43:08 B 7.20 1
7: 2015-01-28 22:43:09 B 7.26 2
8: 2015-01-28 22:43:13 B 7.16 6
9: 2015-01-28 22:43:15 B 7.18 8
10: 2015-01-28 22:43:16 B 7.18 9
Then for ggplot:
library(ggplot2)
ggplot(df, aes(x=Interval, y=v1, color=ID)) + geom_line()
and the graph:
I have a sample data as follows:
data1 <- read.table(text="1/1/12 1:48 AM 1.24
1/1/12 8:14 AM 0.26
1/1/12 2:01 PM 1.15
1/1/12 8:25 PM 0.15
1/2/12 2:36 AM 1.23
1/2/12 9:13 AM 0.25
1/2/12 2:54 PM 1.09
1/2/12 9:17 PM 0.16
1/3/12 3:28 AM 1.24
1/3/12 10:06 AM 0.21
1/3/12 3:52 PM 1.07
1/3/12 10:05 PM 0.15
1/4/12 4:21 AM 1.27
1/4/12 10:56 AM 0.16
1/4/12 4:49 PM 1.08
1/4/12 10:52 PM 0.12
1/5/12 5:12 AM 1.32
1/5/12 11:43 AM 0.1
1/5/12 5:41 PM 1.12
1/5/12 11:37 PM 0.08
1/6/12 5:58 AM 1.38
1/6/12 12:28 PM 0.03
1/6/12 6:27 PM 1.17
", sep="", header=F)
> head(data1)
V1 V2 V3 V4 date
1 1/1/12 1:48 AM 1.24 1/1/12 1:48 AM
2 1/1/12 8:14 AM 0.26 1/1/12 8:14 AM
3 1/1/12 2:01 PM 1.15 1/1/12 2:01 PM
4 1/1/12 8:25 PM 0.15 1/1/12 8:25 PM
5 1/2/12 2:36 AM 1.23 1/2/12 2:36 AM
6 1/2/12 9:13 AM 0.25 1/2/12 9:13 AM
Combine 3 columns to one to make data column
data1$date <- paste(data1$V1, data1$V2, data1$V3)
Create a date sequence to do the interpolation
daterange <- seq(from=as.POSIXct("2012-1-1 00:00"), to = as.POSIXct("2012-1-6 00:00"), length.out =1200)
I want to find the corresponding V4 values of the daterange specified above. I want to do the linear interpolation.
As others have said, you can use approx(...) to interpolate between successive points, although it's debatable if this is a good idea.
data1$posix <- as.POSIXct(data1$date,format="%m/%d/%y %I:%M %p")
df <- as.data.frame(with(data1,approx(posix,V4,n=1200))) # colnames are "x", "y"
colnames(df) <- c("date","V4")
df$posix <- as.POSIXct(df$date,origin="1970-01-01")
library(ggplot2)
ggplot()+
geom_point(data=data1, aes(x=posix, y=V4), color="red", size=5)+
geom_point(data=df, aes(x=posix, y=V4), color="blue", size=1)+
labs(x="Date")
Note the format string in the call to as.POSIXct(...). You have to specify that the times are in 12hr format using %I (not %H), and you have to specify that the string contains AM/PM (using %p), or your character times will not convert correctly. (They will convert, though, without throwing an error - so be careful).