Having a dataframe like this:
data.frame(id = c(1,2,3,4), time_stamp_1 = c("Nov 2016-Current", "May 2013-Current", "Oct 2015-Current", "May 2014-Current"), time_stamp_2 = c("Mar 2015-Nov 2016", "May 2008-May 2013", "Aug 2005-Current", "Oct 2014-Jan 2015"))
How is it possible to add new columns which hold the time difference in months for every row, substituting "Sept 2022" wherever "Current" appears?
Example output:
data.frame(id = c(1,2,3,4), time_stamp_1 = c("Nov 2016-Current", "May 2013-Current", "Oct 2015-Current", "May 2014-Current"), time_stamp_2 = c("Mar 2015-Nov 2016", "May 2008-May 2013", "Aug 2005-Current", "Oct 2014-Jan 2015"), time_stamp_1_duration = c(41,43,24,53), time_stamp_2_duration = c(32,12,45,32))
The duration values shown are placeholders for illustration only; they are not the real results.
This should do the trick. First replace every "Current" and "Sept 2022" with "Sep 2022" ("Sep" is the month abbreviation R recognizes), then use tidyr::separate() and zoo::as.yearmon() to convert to year-month format, then calculate the intervals in months (year difference x 12, as the OP asked):
library(dplyr) # mutate(), across(), select() and all_of() come from dplyr
library(tidyr)
library(zoo)

df <- data.frame(
  id = c(1, 2, 3, 4),
  time_stamp_1 = c("Nov 2016-Current", "May 2013-Current",
                   "Oct 2015-Current", "May 2014-Current"),
  time_stamp_2 = c("Mar 2015-Nov 2016", "May 2008-May 2013",
                   "Aug 2005-Current", "Oct 2014-Jan 2015")
)

# Normalise open-ended spans: "Current" and the non-standard "Sept 2022"
# both become "Sep 2022", which the "%b %Y" format can parse.
df[2:3] <- lapply(df[2:3], function(x) gsub("-Current|-Sept 2022", "-Sep 2022", x))

df %>%
  # remove = FALSE keeps the original span columns, so no join back onto
  # `df` is needed afterwards.
  separate(time_stamp_1, into = c("my1a", "my1b"), sep = "-", remove = FALSE) %>%
  separate(time_stamp_2, into = c("my2a", "my2b"), sep = "-", remove = FALSE) %>%
  # List the helper columns explicitly: with remove = FALSE a range such as
  # my1a:my2b would also sweep up time_stamp_2.
  mutate(across(c(my1a, my1b, my2a, my2b), ~ as.yearmon(.x, format = "%b %Y"))) %>%
  # A yearmon difference is in (fractional) years; round() removes the
  # floating-point noise left after multiplying by 12.
  mutate(
    interval_1 = round((my1b - my1a) * 12),
    interval_2 = round((my2b - my2a) * 12)
  ) %>%
  select(all_of(names(df)), interval_1, interval_2)
Output:
id time_stamp_1 time_stamp_2 interval_1 interval_2
1 1 Nov 2016-Sep 2022 Mar 2015-Nov 2016 70 20
2 2 May 2013-Sep 2022 May 2008-May 2013 112 60
3 3 Oct 2015-Sep 2022 Aug 2005-Sep 2022 83 205
4 4 May 2014-Sep 2022 Oct 2014-Jan 2015 100 3
As G. Grothendieck mentions in the comments, we could wrap this in a function:
# thanks to G. Grothendieck
#' Convert "Mon YYYY-Mon YYYY" spans to durations in whole months
#'
#' @param x Character vector of spans such as "Nov 2016-Current" or
#'   "Mar 2015-Nov 2016". An end of "Current" or "Sept 2022" is read as
#'   "Sep 2022".
#' @return Numeric vector, same length as `x`, with the number of months
#'   between the start and the end of each span.
ts2mos <- function(x) {
  x <- gsub("-Current|-Sept 2022", "-Sep 2022", x)
  # Parsing the whole span with "%b %Y" picks up the leading (start) month;
  # the text after the span is ignored by the parser.
  span_start <- as.yearmon(x, "%b %Y")
  # Drop everything up to the last "-" to isolate the end month.
  span_end <- as.yearmon(sub(".*-", "", x), "%b %Y")
  # yearmon values are fractional years, so 12 * difference can come out as
  # e.g. 70.00000000000182; round() returns the exact month count.
  round(12 * (span_end - span_start))
}
df %>% mutate(interval_1 = ts2mos(time_stamp_1),
interval_2 = ts2mos(time_stamp_2))
Using tidyverse
library(dplyr)
library(lubridate)
library(stringr)
# Add a "<column>_duration" column for every column whose name starts with
# 'time_stamp'.
# NOTE(review): `df1` is not defined in this snippet; presumably it is the
# data frame from the question. Confirm before running.
df1 %>%
mutate(across(starts_with('time_stamp'), ~ {
# Substitute the literal "Current" with 'Sep 2022' in the span text.
tmp <- str_replace(.x, "Current", 'Sep 2022') %>%
# Rearrange "Mon YYYY-Mon YYYY" into "YYYY-Mon-01/YYYY-Mon-01"
# (start/end, each pinned to day 01).
str_replace("(\\w+) (\\d+)-(\\w+) (\\d+)", "\\2-\\1-01/\\4-\\3-01") %>%
# Hand the "start/end" string to lubridate's interval().
interval
# Integer-divide the interval by a one-month period.
tmp %/% months(1)}, .names = "{.col}_duration"))
-output
id time_stamp_1 time_stamp_2 time_stamp_1_duration time_stamp_2_duration
1 1 Nov 2016-Current Mar 2015-Nov 2016 70 20
2 2 May 2013-Current May 2008-May 2013 112 60
3 3 Oct 2015-Current Aug 2005-Current 83 205
4 4 May 2014-Current Oct 2014-Jan 2015 100 3
A tidyverse approach
library(dplyr)
library(lubridate)
library(stringr)
# Add a "<column>_duration" column (whole months) for each time_stamp column.
# "Current" is substituted with "Sep 2022" only on a temporary copy used for
# the calculation; the original columns are never modified. This avoids the
# replace-then-revert round trip, which would turn a genuine "Sep 2022" end
# date into "Current".
df %>%
  mutate(across(
    starts_with("time_stamp"),
    ~ vapply(
      # Split each span into c(start, end) after resolving "Current".
      str_split(str_replace(.x, "Current", "Sep 2022"), "-"),
      function(span) interval(my(span[1]), my(span[2])) %/% months(1),
      numeric(1)
    ),
    .names = "{.col}_duration"
  ))
id time_stamp_1 time_stamp_2 time_stamp_1_duration
1 1 Nov 2016-Current Mar 2015-Nov 2016 70
2 2 May 2013-Current May 2008-May 2013 112
3 3 Oct 2015-Current Aug 2005-Current 83
4 4 May 2014-Current Oct 2014-Jan 2015 100
time_stamp_2_duration
1 20
2 60
3 205
4 3
library(stringr)
#' Whole calendar months between the two ends of a time span
#'
#' Counts calendar months directly (12 * year difference + month difference)
#' rather than dividing a day count by 30, which drifts on long spans
#' (Nov 2016 to Sep 2022 is 2130 days, i.e. "71" months instead of 70).
#'
#' @param x Character vector of length 2, `c(start, end)`, each element in
#'   "Mon YYYY" form (e.g. "Nov 2016"). The literal "Current" is treated as
#'   "Sep 2022".
#' @return Numeric scalar: the number of calendar months from start to end;
#'   `NA` if either element cannot be parsed.
timespan_to_duration <- function(x) {
  x[x %in% "Current"] <- "Sep 2022"
  parts <- strsplit(trimws(x), "\\s+")
  # Match on the first three letters, case-insensitively, against the
  # built-in English abbreviations: independent of the session locale and
  # tolerant of variants such as "Sept" or "September".
  month_key <- vapply(
    parts,
    function(p) tolower(substr(p[1], 1, 3)),
    character(1)
  )
  month <- match(month_key, tolower(month.abb))
  year <- as.integer(vapply(parts, function(p) p[2], character(1)))
  as.numeric(12L * (year[2] - year[1]) + (month[2] - month[1]))
}
df <- data.frame(id = c(1,2,3,4),
time_stamp_1 = c("Nov 2016-Current", "May 2013-Current", "Oct 2015-Current", "May 2014-Current"),
time_stamp_2 = c("Mar 2015-Nov 2016", "May 2008-May 2013", "Aug 2005-Current", "Oct 2014-Jan 2015"))
df$time_stamp_1_duration <- df$time_stamp_1 |>
str_split('-') |>
lapply(timespan_to_duration) |>
unlist()
df$time_stamp_2_duration <- df$time_stamp_2 |>
str_split('-') |>
lapply(timespan_to_duration) |>
unlist()
df
One possible solution using the function tstrsplit from the data.table package. Note that I am also using the built-in constant month.abb.
# For each time_stamp column (columns 2 and 3 of df), split
# "Mon YYYY-Mon YYYY" into month/year pieces and compute the span in months.
df[c("duration1", "duration2")] <- lapply(df[2:3], function(x) {
  # Split on whitespace or "-" into four pieces; type.convert = TRUE lets
  # the year pieces come back as numbers.
  parts <- data.table::tstrsplit(sub("Current", "Sep 2022", x),
                                 split = "\\s|-",
                                 type.convert = TRUE,
                                 names = c("mo1", "yr1", "mo2", "yr2"))
  # Month abbreviation -> month number via the built-in month.abb.
  parts[c("mo1", "mo2")] <- lapply(parts[c("mo1", "mo2")], match, month.abb)
  # Plain calendar arithmetic. The previous pmax(yr2 - yr1 - 1, 0) form
  # added a spurious 12 months whenever both dates fell in the same year
  # (e.g. "Mar 2015-Nov 2015" gave 20 instead of 8).
  (parts$yr2 - parts$yr1) * 12 + (parts$mo2 - parts$mo1)
})
id time_stamp_1 time_stamp_2 duration1 duration2
1 1 Nov 2016-Current Mar 2015-Nov 2016 70 20
2 2 May 2013-Current May 2008-May 2013 112 60
3 3 Oct 2015-Current Aug 2005-Current 83 205
4 4 May 2014-Current Oct 2014-Jan 2015 100 3
Related
I have columns like these:
year period period2 Sales
2015 201504 April 2015 10000
2015 201505 May 2015 11000
2018 201803 March 2018 12000
I want to change the type of period or period2 column as a date, to use later in time series analysis
Data:
tibble::tibble(
year = c(2015,2015,2018),
period = c(201504, 201505,201803 ),
period2 = c("April 2015", "May 2015", "March 2018"),
Sales = c(10000,11000,12000)
)
Using lubridate package you can transform them into date variables:
df <- tibble::tibble(
year = c(2015,2015,2018),
period = c(201504, 201505,201803 ),
period2 = c("April 2015", "May 2015", "March 2018"),
Sales = c(10000,11000,12000)
)
library(dplyr)
df %>%
mutate(period = lubridate::ym(period),
period2 = lubridate::my(period2))
I am new to R and am working on an assignment where I import some JSON data to (1) create a time series graph and (2) decompose the time-series. It's the decompose function where I'm struggling. Here is what works...
# Import JSON & convert to data.frame
aor <- fromJSON.....
aor <- as.data.frame......
# Combine the year and month into a date format
aor$date <- as.yearmon(paste(aor$year, aor$month), "%Y %m")
# Ensure data is float not chr
aor$mwh <- as.numeric(aor$mwh)
# Prep the data.frame for time-series analysis by converting to xts
aor <- xts(x = aor, order.by = aor$date)
# Successfully output a time-series graph.
dygraph(aor)
Here is a sample of aor up to this point...
> aor
mwh date
Jan 2001 " 1.42000" "Jan 2001"
Feb 2001 " 1.28400" "Feb 2001"
Mar 2001 " 1.25800" "Mar 2001"
Apr 2001 " 1.53600" "Apr 2001"
May 2001 " 1.47100" "May 2001"
Jun 2001 " 1.91800" "Jun 2001"
Jul 2001 " 2.37800" "Jul 2001"
> dput(head(aor, 10))
structure(c(" 1.42000", " 1.28400", " 1.25800", " 1.53600",
" 1.47100", " 1.91800", " 2.37800", " 2.47000", " 1.65100",
" 1.58100", "Jan 2001", "Feb 2001", "Mar 2001", "Apr 2001",
"May 2001", "Jun 2001", "Jul 2001", "Aug 2001", "Sep 2001", "Oct 2001"
), .Dim = c(10L, 2L), .Dimnames = list(NULL, c("mwh", "date")))
The code I thought would produce the decomposition graphic...
ts <- as.ts(aor)
> ts
mwh date
Jan 1 1.42000 Jan 2001
Feb 1 1.28400 Feb 2001
Mar 1 1.25800 Mar 2001
Apr 1 1.53600 Apr 2001
May 1 1.47100 May 2001
Jun 1 1.91800 Jun 2001
Jul 1 2.37800 Jul 2001
d <- decompose(ts)
plot(d)
I get this error when trying to decompose ts...
Error in `-.default`(x, trend) : non-numeric argument to binary operator
edit: See solution at the bottom.
I have trouble using the result of a which.max call in a list.
Below follows an example which reproduces my problem.
Create dataframe
library(dplyr)
library(ggplot2)
library(forcats)
name <- c('A','A','A', 'A','A','A', 'A','A','A',
'B','B','B', 'B','B','B', 'B','B','B',
'C','C','C', 'C','C','C', 'C','C','C')
month = c("oct 2018", "oct 2018", "oct 2018","nov 2018", "nov 2018", "nov 2018","dec 2018", "dec 2018", "dec 2018",
"oct 2018", "oct 2018", "oct 2018","nov 2018", "nov 2018", "nov 2018","dec 2018", "dec 2018", "dec 2018" ,
"oct 2018", "oct 2018", "oct 2018","nov 2018", "nov 2018", "nov 2018","dec 2018", "dec 2018", "dec 2018" )
value <- seq(1:length(month))
df = data.frame(name, month, value)
df
Outcome
name month value
A oct 2018 1
A oct 2018 2
A oct 2018 3
A nov 2018 4
A nov 2018 5
A nov 2018 6
A dec 2018 7
A dec 2018 8
A dec 2018 9
B oct 2018 10
B oct 2018 11
B oct 2018 12
B nov 2018 13
B nov 2018 14
B nov 2018 15
B dec 2018 16
B dec 2018 17
B dec 2018 18
C oct 2018 19
C oct 2018 20
C oct 2018 21
C nov 2018 22
C nov 2018 23
C nov 2018 24
C dec 2018 25
C dec 2018 26
C dec 2018 27
Extract name of the observation with the largest value
memberLargestValue = df[which.max(df$value),]$name
memberLargestValue
Outcome
[1] C
Levels: A B C
Merge memberLargestValue with pre-existing list
oldList = c("A", "A")
newList = c(oldList, memberLargestValue)
newList
Outcome
[1] "A" "A" "3"
I do not want the "3" in the above list; I want "C" instead. Does anybody know how I can access the "C" in "memberLargestValue" and get it into the list?
Solution:
Change to "character" type:
memberLargestValue = as.character(df[which.max(df$value),]$name)
edit: I rewrote the whole post including an example that is possible to replicate directly, and also containing the solution provided by Paweł Chabros. Thank you Paweł Chabros for providing a very neat answer!
In the following picture I struggle reversing the order of the box-plots, wanting to change it to go from October to December when looking left to right:
Click here to display plot
The dataframe is created by
library(dplyr)
library(ggplot2)
library(forcats)
name <- c('A','A','A', 'A','A','A', 'A','A','A',
'B','B','B', 'B','B','B', 'B','B','B',
'C','C','C', 'C','C','C', 'C','C','C')
month = c("oct 2018", "oct 2018", "oct 2018","nov 2018", "nov 2018", "nov 2018","dec 2018", "dec 2018", "dec 2018",
"oct 2018", "oct 2018", "oct 2018","nov 2018", "nov 2018", "nov 2018","dec 2018", "dec 2018", "dec 2018" ,
"oct 2018", "oct 2018", "oct 2018","nov 2018", "nov 2018", "nov 2018","dec 2018", "dec 2018", "dec 2018" )
value <- seq(1:length(month))
df = data.frame(name, month, value)
df
The data frame looks like this
name month value
A oct 2018 1
A oct 2018 2
A oct 2018 3
A nov 2018 4
A nov 2018 5
A nov 2018 6
A dec 2018 7
A dec 2018 8
A dec 2018 9
B oct 2018 10
B oct 2018 11
B oct 2018 12
B nov 2018 13
B nov 2018 14
B nov 2018 15
B dec 2018 16
B dec 2018 17
B dec 2018 18
C oct 2018 19
C oct 2018 20
C oct 2018 21
C nov 2018 22
C nov 2018 23
C nov 2018 24
C dec 2018 25
C dec 2018 26
C dec 2018 27
The plot in the figure above is created by
wantedMonths = c("oct 2018", "nov 2018", "dec 2018")
wantedNames = c("A", "B")
df2= df[df$name %in% wantedNames, ]
ggplot(df2[df2$month %in% wantedMonths , ]) + geom_boxplot(aes(as.factor(name), value, fill=month))#fct_rev(month)
The command that creates the correct plot, which was provided by Paweł Chabros, is
ggplot(df2[df2$month %in% wantedMonths , ]) + geom_boxplot(aes(as.factor(name), value, fill=fct_rev(month)))
ggplot uses the order of the factor for this purpose. You can set month as ordered factor either inside ggplot call or change it before, in the data. In that case just add the following line before ggplot call:
df[['month']] = ordered(df[['month']], levels = c('oct 2018', 'nov 2018', 'dec 2018'))
If your problem is the ordering of the bar you can set them manually by scale_colour_manual function.
Just add this while plotting with ggplot.
scale_colour_manual(values = c("red","green","blue"))
The answer, which is also included in the edited question, is to use fct_rev:
ggplot(df2[df2$month %in% wantedMonths , ]) + geom_boxplot(aes(as.factor(name), value, fill=fct_rev(month)))
I have data stored as a collection of data-frames named A,B,C,...
A <- structure(list(Mes = c("Feb 2015", "Jun 2015", "Jul 2015", "Aug 2015", "Sep 2015", "Oct 2015", "Nov 2015"), Valor = c(73.98, 1013.21, 3209.38, 4168.41, 5233.48, 1313.83, 622.78)), .Names = c("Mes","Valor"), class = "data.frame", row.names = c("1", "2", "3","4", "5", "6", "7"))
B <- structure(list(Mes = c("Aug 2015", "Sep 2015", "Oct 2015"), Valor = c(1391.72, 761.15, 107.58)), .Names = c("Mes", "Valor"), class = "data.frame", row.names = c("1", "2", "3"))
C <- structure(list(Mes = c("Aug 2015"), Valor = c(0)), .Names = c("Mes", "Valor"), class = "data.frame", row.names = c("1"))
Note: C is being used to represent an empty data-frame
The problem is to build a matrix like:
"Feb 2015" "Jun 2015" "Jul 2015" "Aug 2015" "Sep 2015" "Oct 2015" "Nov 2015"
"73.98" "1013.21" "3209.38" "4168.41" "5233.48" "1313.83" "622.78"
0 0 0 "1391.72" "761.15" "107.58" 0
0 0 0 0 0 0 0
We can place the datasets in a list, use rbindlist from data.table to create a single data.table, and reshape from 'long' to 'wide' using dcast
library(data.table)#v1.9.6+
dcast(rbindlist(list(A,B))[, ind:= 1:.N, Mes], ind~Mes,
value.var='Valor', fill=0)
# ind Aug 2015 Feb 2015 Jul 2015 Jun 2015 Nov 2015 Oct 2015 Sep 2015
#1: 1 4168.41 73.98 3209.38 1013.21 622.78 1313.83 5233.48
#2: 2 1391.72 0.00 0.00 0.00 0.00 107.58 761.15
In the devel version i.e. v1.9.7, we can also use rowid to create the sequence column
# rowid(Mes) numbers the rows within each value of Mes, replacing the
# explicit `ind` column. value.var must match the column name exactly: the
# column is "Valor" (capital V).
dcast(rbindlist(list(A, B)), rowid(Mes) ~ Mes, value.var = "Valor", fill = 0)
Update
Using the new datasets
DT <- rbindlist(list(A, B,C))[, ind:= 1:.N, Mes]
dcast(DT, ind~Mes, value.var='Valor', fill=0)
# ind Aug 2015 Feb 2015 Jul 2015 Jun 2015 Nov 2015 Oct 2015 Sep 2015
#1: 1 4168.41 73.98 3209.38 1013.21 622.78 1313.83 5233.48
#2: 2 1391.72 0.00 0.00 0.00 0.00 107.58 761.15
#3: 3 0.00 0.00 0.00 0.00 0.00 0.00 0.00
If we need a matrix output, this can be also done with base R options. We create the sequence column using ave and then reshape with xtabs.
df1 <- transform(do.call(rbind, list(A, B)), ind= ave(seq_along(Mes), Mes, FUN=seq_along))
xtabs(Valor~ind+Mes, df1)
# Mes
#ind Aug 2015 Feb 2015 Jul 2015 Jun 2015 Nov 2015 Oct 2015 Sep 2015
# 1 4168.41 73.98 3209.38 1013.21 622.78 1313.83 5233.48
# 2 1391.72 0.00 0.00 0.00 0.00 107.58 761.15
data
A <- structure(list(Mes = c("Feb 2015", "Jun 2015", "Jul 2015", "Aug 2015",
"Sep 2015", "Oct 2015", "Nov 2015"), Valor = c(73.98, 1013.21,
3209.38, 4168.41, 5233.48, 1313.83, 622.78)), .Names = c("Mes",
"Valor"), class = "data.frame", row.names = c("1", "2", "3",
"4", "5", "6", "7"))
B <- structure(list(Mes = c("Aug 2015", "Sep 2015", "Oct 2015"), Valor = c(1391.72,
761.15, 107.58)), .Names = c("Mes", "Valor"), class = "data.frame", row.names = c("1",
"2", "3"))