How can I get highcharter to represent a forecast object? - r

This is a follow-on to this question.
I am trying to get the pipeline given in that question to accept a forecast object as input:
Again, using this data:
> dput(t)
structure(c(2, 2, 267822980, 325286564, 66697091, 239352431,
94380295, 1, 126621669, 158555699, 32951026, 23, 108000151, 132505189,
29587564, 120381505, 25106680, 117506099, 22868767, 115940080,
22878163, 119286731, 22881061), .Dim = c(23L, 1L), index = structure(c(1490990400,
1490994000, 1490997600, 1491001200, 1491004800, 1491008400, 1491012000,
1491026400, 1491033600, 1491037200, 1491040800, 1491058800, 1491062400,
1491066000, 1491069600, 1491073200, 1491076800, 1491109200, 1491112800,
1491120000, 1491123600, 1491156000, 1491159600), tzone = "US/Mountain", tclass = c("POSIXct",
"POSIXt")), class = c("xts", "zoo"), .indexCLASS = c("POSIXct",
"POSIXt"), tclass = c("POSIXct", "POSIXt"), .indexTZ = "US/Mountain", tzone = "US/Mountain", .CLASS = "double", .Dimnames = list(
NULL, "count"))
I use
highchart(type = 'stock') %>%
hc_add_series(t) %>%
hc_xAxis(type = 'datetime')
To create
But if I follow this same recipe using
require("forecast")
t.arima <- auto.arima(t)
x <- forecast(t.arima, level = c(95, 80))
highchart(type = 'stock') %>%
hc_add_series(x) %>%
hc_xAxis(type = 'datetime')
I get this error:
Error in as.Date.ts(.) : unable to convert ts time to Date class
How can I show the forecast series along with the historical? I've seen this in the documentation, but don't understand why I'd be getting this error.
JS CONSOLE OUTPUT FOR JK:
DF DATA AFTER RE-INDEXING:
dput(df)
structure(list(Index = structure(c(1490968800, 1490972400, 1490976000,
1490979600, 1490983200, 1490986800, 1490990400, 1491004800, 1491012000,
1491015600, 1491019200, 1491037200, 1491040800, 1491044400, 1491048000,
1491051600, 1491055200, 1491087600, 1491091200, 1491098400, 1491102000,
1491134400, 1491138000, 1491217200, 1491220800, 1491224400, 1491228000,
1491231600, 1491235200, 1491238800, 1491242400, 1491246000, 1491249600,
1491253200, 1491256800, 1491260400, 1491264000, 1491267600), class = c("POSIXct",
"POSIXt")), Data = c(2, 2, 259465771, 315866206, 64582553, 233440220,
91918347, 1, 126563786, 158555699, 32951026, 23, 108000151, 132505189,
29587564, 120381505, 25106680, 117506099, 22868767, 115898351,
22878163, 119285747, 22881061, 157925588, 32447780, 223096830,
281656273, 45406684, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
Fitted = c(102170573.857143, 102170573.857143, 102170573.857143,
102170573.857143, 102170573.857143, 102170573.857143, 102170573.857143,
102170573.857143, 102170573.857143, 102170573.857143, 102170573.857143,
102170573.857143, 102170573.857143, 102170573.857143, 102170573.857143,
102170573.857143, 102170573.857143, 102170573.857143, 102170573.857143,
102170573.857143, 102170573.857143, 102170573.857143, 102170573.857143,
102170573.857143, 102170573.857143, 102170573.857143, 102170573.857143,
102170573.857143, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA),
`Point Forecast` = c(NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, 102170573.857143, 102170573.857143, 102170573.857143,
102170573.857143, 102170573.857143, 102170573.857143, 102170573.857143,
102170573.857143, 102170573.857143, 102170573.857143), `Lo 80` = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -16003477.5789723,
-16003477.5789723, -16003477.5789723, -16003477.5789723,
-16003477.5789723, -16003477.5789723, -16003477.5789723,
-16003477.5789723, -16003477.5789723, -16003477.5789723),
`Hi 80` = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 220344625.293258, 220344625.293258, 220344625.293258,
220344625.293258, 220344625.293258, 220344625.293258, 220344625.293258,
220344625.293258, 220344625.293258, 220344625.293258), `Lo 95` = c(NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, -78561041.5917782,
-78561041.5917782, -78561041.5917782, -78561041.5917782,
-78561041.5917782, -78561041.5917782, -78561041.5917782,
-78561041.5917782, -78561041.5917782, -78561041.5917782),
`Hi 95` = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, 282902189.306064, 282902189.306064, 282902189.306064,
282902189.306064, 282902189.306064, 282902189.306064, 282902189.306064,
282902189.306064, 282902189.306064, 282902189.306064)), .Names = c("Index",
"Data", "Fitted", "Point Forecast", "Lo 80", "Hi 80", "Lo 95",
"Hi 95"), row.names = c(NA, -38L), class = "data.frame")

Not sure this is due to the irregular time series.
Anyway, ggfortify:::fortify.forecast is your friend. Why? Because fortify (try to) transform all the R object in data frames. So:
library(highcharter)
library(forecast)
t.arima <- auto.arima(t)
x <- forecast(t, level = c(95, 80))
library(highcharter)
library(ggplot2)
library(ggfortify)
#>
#> Attaching package: 'ggfortify'
#> The following object is masked from 'package:forecast':
#>
#> gglagplot
class(x)
#> [1] "forecast"
df <- fortify(x)
head(df)
#> Index Data Fitted Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
#> 1 1 2 140658844 NA NA NA NA NA
#> 2 3601 2 121734145 NA NA NA NA NA
#> 3 7201 267822980 105355638 NA NA NA NA NA
#> 4 10801 325286564 127214522 NA NA NA NA NA
#> 5 14401 66697091 153863779 NA NA NA NA NA
#> 6 18001 239352431 142136089 NA NA NA NA NA
Now you can:
highchart(type = "stock") %>%
hc_add_series(df, "line", hcaes(Index, Data), name = "Original") %>%
hc_add_series(df, "line", hcaes(Index, Fitted), name = "Fitted") %>%
hc_add_series(df, "line", hcaes(Index, `Point Forecast`), name = "Forecast") %>%
hc_add_series(df, "arearange", hcaes(Index, low = `Lo 80`, high = `Hi 80`), name = "Interval")
As you can see, fortify can't detect the real time too. So you need to transform the Index in the time what you want.

The error
Error in as.Date.ts(.) : unable to convert ts time to Date class
is due to the fact that you have a ts object with a frequency that is not covered by the function as.Date.ts(.). When we see what this function does, this is what we get:
function (x, offset = 0, ...)
{
time.x <- unclass(time(x)) + offset
if (frequency(x) == 1)
as.Date(paste(time.x, 1, 1, sep = "-"))
else if (frequency(x) == 4)
as.Date(paste((time.x + 0.001)%/%1, 3 * (cycle(x) - 1) +
1, 1, sep = "-"))
else if (frequency(x) == 12)
as.Date(paste((time.x + 0.001)%/%1, cycle(x), 1, sep = "-"))
else stop("unable to convert ts time to Date class")
}
This function considers only 3 values for the frequency of a ts object: 1, 4, or 12. When we take a look at the frequency of your object x, we see that its frequency = 0.000277777777777778, so when highcharter calls the function using the ts objects in x it stops and gives you that error.
We have two options on how to "fix" it:
Transform t into a ts object (instead of a xts object) with frequency = 1 before running auto.arima and forecast;
After running auto.arima and forecast, we can create an index for the future dates and transform the ts objects in x into xts objects with the correct index.
I said "fix" because these solutions are not perfect, as we will see.
Option 1
t <- structure(
c(2, 2, 267822980, 325286564, 66697091, 239352431,
94380295, 1, 126621669, 158555699, 32951026, 23,
108000151, 132505189, 29587564, 120381505, 25106680,
117506099, 22868767, 115940080, 22878163, 119286731,
22881061),
.Dim = c(23L, 1L),
index = structure(c(1490990400, 1490994000, 1490997600,
1491001200, 1491004800, 1491008400,
1491012000, 1491026400, 1491033600,
1491037200, 1491040800, 1491058800,
1491062400, 1491066000, 1491069600,
1491073200, 1491076800, 1491109200,
1491112800, 1491120000, 1491123600,
1491156000, 1491159600),
tzone = "US/Mountain",
tclass = c("POSIXct","POSIXt")),
class = c("xts", "zoo"),
.indexCLASS = c("POSIXct","POSIXt"),
tclass = c("POSIXct", "POSIXt"),
.indexTZ = "US/Mountain",
tzone = "US/Mountain",
.CLASS = "double",
.Dimnames = list(NULL, "count"))
require("forecast")
library(highcharter)
# SOLUTION 1
t.tmp <- ts(t, start=1, end = length(t))
t.arima.1 <- auto.arima(t.tmp)
x.1 <- forecast(t.arima.1, level = c(95, 80))
highchart(type = 'stock') %>%
hc_add_series(x.1) %>%
hc_add_series(x.1$x, name = "Original") %>%
hc_add_series(x.1$fitted, name = "Fitted")
The problem with this approach is that we lose the dates (axis, tooltip, etc.).
Option 2, 1st try: Hourly Forecasts
I tried to create an hourly index for the future values, but for some reason Highcharter moves the intervals to the left (or there's some problem with the dates that I can't see/figure out).
Option 2, 2nd try: Daily Forecasts
When I changed it to a daily index for the future values it worked, but it's weird since we have hourly observations and the forecast part of our plot shows "daily forecasts".
Here is the full code:
t <- structure(
c(2, 2, 267822980, 325286564, 66697091, 239352431,
94380295, 1, 126621669, 158555699, 32951026, 23,
108000151, 132505189, 29587564, 120381505, 25106680,
117506099, 22868767, 115940080, 22878163, 119286731,
22881061),
.Dim = c(23L, 1L),
index = structure(c(1490990400, 1490994000, 1490997600,
1491001200, 1491004800, 1491008400,
1491012000, 1491026400, 1491033600,
1491037200, 1491040800, 1491058800,
1491062400, 1491066000, 1491069600,
1491073200, 1491076800, 1491109200,
1491112800, 1491120000, 1491123600,
1491156000, 1491159600),
tzone = "US/Mountain",
tclass = c("POSIXct","POSIXt")),
class = c("xts", "zoo"),
.indexCLASS = c("POSIXct","POSIXt"),
tclass = c("POSIXct", "POSIXt"),
.indexTZ = "US/Mountain",
tzone = "US/Mountain",
.CLASS = "double",
.Dimnames = list(NULL, "count"))
require("forecast")
library(highcharter)
library(xts)
t.arima <- auto.arima(t)
x <- forecast(t.arima, level = c(95, 80))
# Problem
## Time from 'forecast'
time.x <- time(x$mean) # ts variable
time.x # see that frequency = 0.000277777777777778
## Original time
time.t <- time(t) # POSIXct variable, use as.ts to see frequency
as.ts(time.t) # frequency = 1
## Try to transform back to formatted date
as.POSIXct(as.double(time.t), tz = "US/Mountain", origin = "1970-01-01")
as.POSIXct(as.double(time.x), tz = "US/Mountain", origin = "1970-01-01")
#--------------------------------------------------------#
# SOLUTION 1
t.tmp <- ts(t, start=1, end = length(t))
t.arima.1 <- auto.arima(t.tmp)
x.1 <- forecast(t.arima.1, level = c(95, 80))
highchart(type = 'stock') %>%
hc_add_series(x.1) %>%
hc_add_series(x.1$x, name = "Original") %>%
hc_add_series(x.1$fitted, name = "Fitted")
#------------------------------------------------------#
# SOLUTION 2 - With correct dates but wrong plot
## Create new forecast variable
x.2 <- forecast(t.arima.1, level = c(95, 80))
## Take forecast length
forecast.length <- length(time.x)
### Create New Forecast dates (HOUR)
### Since I don't know the exact forecast times, I'll add one HOUR
### for each obs starting from the last date in the original dataset
last.date <- time.t[length(time.t)]
new.forecast.time.hour <- as.POSIXct(last.date) + c((1:forecast.length)*3600)
## Insert date back
x.2$mean <- xts(x.1$mean, order.by = new.forecast.time.hour)
x.2$lower <- xts(x.1$lower, order.by = new.forecast.time.hour)
x.2$upper <- xts(x.1$upper, order.by = new.forecast.time.hour)
### Original Data
x.2$x <- xts(x.1$x, order.by = time.t)
### Fitted
x.2$fitted <- xts(x.1$fitted, order.by = time.t)
# Plot forecasts with correct date
highchart(type = 'stock') %>%
hc_add_series(x.2) %>%
hc_add_series(x.2$x, name = "Original") %>%
hc_add_series(x.2$fitted, name = "Fitted") %>%
hc_xAxis(type = 'datetime')
#------------------------------------------------------#
# SOLUTION 3 - Correct plot but only for daily forecasts
## Create new forecast variable
x.3 <- forecast(t.arima.1, level = c(95, 80))
## Take forecast length
forecast.length <- length(time.x)
### Create New Forecast dates (DAY)
### Since I don't know the exact forecast times, I'll add one DAY
### for each obs starting from the last date in the original dataset
last.date <- time.t[length(time.t)]
new.forecast.time.day <- as.POSIXct(last.date) + c((1:forecast.length)*3600*24)
## Add change from as.POSIXct to as.Date
new.forecast.time.day <- as.Date(new.forecast.time.day)
## Insert date back
x.3$mean <- xts(x.1$mean, order.by = new.forecast.time.day)
x.3$lower <- xts(x.1$lower, order.by = new.forecast.time.day)
x.3$upper <- xts(x.1$upper, order.by = new.forecast.time.day)
### Original Data
x.3$x <- xts(x.1$x, order.by = time.t)
### Fitted
x.3$fitted <- xts(x.1$fitted, order.by = time.t)
# Plot forecasts with correct date
highchart(type = 'stock') %>%
hc_add_series(x.3) %>%
hc_add_series(x.3$x, name = "Original") %>%
hc_add_series(x.3$fitted, name = "Fitted") %>%
hc_xAxis(type = 'datetime')
One other thing: the fitted values on my plots differ from the fitted values on jbkunst's plot because he used forecast directly on t, not on t.arima (just a typo, I believe). This way, my forecasts are based on an Arima model, while his are based on an ETS model.

Related

Conditional statements for one row vs all rows

I don't know what's going on here. Do I have some logical flaw in the code?
I want to match two datasets by their time difference. One case is around 4hs different from the entry in the other set. I am calculating the difference, e.g.:
qnr$submitdate[10]-raw1$time[7]
Time difference of 4 hours
I am specifying a time window:
sum(qnr$submitdate[10]-raw1$time[7] <= 4 & qnr$submitdate[10]-raw1$time[7] > 3.995)
[1] 1
Perfect, 1 match!
Now when I am considering the whole data set, I get 0 matches, how can that be?
sum(qnr$submitdate[10]-raw1$time <= 4 & qnr$submitdate[10]-raw1$time > 3.995)
[1] 0
Specifically, I want to match an identifier:
for (i in 1:nrow(qnr)){
match <- raw1$subject[(qnr$submitdate[i]-raw1$time <= 4 & qnr$submitdate[i]-raw1$time > 3.995)]
if(length(match)>0) qnr$subject[i] <- match
}
this works, but only for some cases, not the one mentioned above. Can someone please help me and enlighten me?
Data:
qnr <- structure(list(submitdate = structure(c(1635427498, 1635427876,
1635428218, 1635429757, 1635430844, 1635432380, 1635435962, 1635453487,
1635464448, 1635508264, 1635509440, 1635509727, 1635510277, 1635511263,
1635511718, 1635514199, 1635514329, 1635517928, 1635519441, 1635519704,
1635520386, 1635521108, 1635522747, 1635525148, 1635526577), tzone = "UTC", class = c("POSIXct",
"POSIXt")), subject = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), row.names = c(NA,
-25L), class = c("tbl_df", "tbl", "data.frame"))
raw1 <- structure(list(time = structure(c(1635413099, 1635413819, 1635416446,
1635417980, 1635421563, 1635439088, 1635493864, 1635495041, 1635495326,
1635495876, 1635496863, 1635499803, 1635499932, 1635503528, 1635505042,
1635508347, 1635512177, 1635512850, 1635518752, 1635519382), class = c("POSIXct",
"POSIXt"), tzone = ""), subject = c("9wtd4kldpun6bhgq", "qbvhqxuw67x1eduw",
"k2dc9c88t3jcfssy", "vmvwfc6z7j236nhk", "7qo7ra1jj25ue3fb", "5xx9qkkb53nzxev5",
"o6zaaq469c7t2jps", "dfsj021ojphza6uc", "4k0l4a3yrb33hel1", "vf6usaa0cl8kz17t",
"f1wwfeoeekoru88z", "oe8e2u6w4a1f6f6m", "tnxxywtpsj8nejoa", "zht8w1bfhq4dk22l",
"atd314r9a4htlaal", "mwbh9eafxczk0x8u", "ke7m4qqp4aodd1fb", "v13fx76lsohsa1hh",
"8kvynhcvfs09g658", "5scqtdz8ha8cuxt1")), row.names = c(79226L,
26641L, 79425L, 79624L, 79823L, 26789L, 2961L, 3109L, 3257L,
47585L, 3405L, 3553L, 3701L, 47784L, 3849L, 47983L, 48182L, 48381L,
48580L, 48779L), class = c("data.table", "data.frame"))
The reason is that subtracting times uses the timediff-function which uses units="auto" as a standard.
It works, when changing
qnr$submitdate[i]-raw1$time
to
difftime(qnr$submitdate[i],raw1$time, units="hours")

r Replace multiple strings in a data frame column with multiple strings from a column of another data frame

I have a dataframe (df1) with a column "PartcipantID". Some ParticipantIDs are wrong and should be replaced with the correct ParticipantID. I have another dataframe (df2) where all Participant IDs appear in columns Goal_ID to T4. The Participant IDs in column "Goal_ID" are the correct IDs.
Now I want to replace all ParticipantIDs in df1 with all Goal_ID ParticipantIDs from df2.
This is my original dataframe (df1):
structure(list(Partcipant_ID = c("AA_SH_RA_91", "AA_SH_RA_91",
"AB_BA_PR_93", "AB_BH_VI_90", "AB_BH_VI_90", "AB_SA_TA_91", "AJ_BO_RA_92",
"AJ_BO_RA_92", "AK_SH_HA_91", "AL_EN_RA_95", "AL_MA_RA_95", "AL_SH_BA_99",
"AM_BO_AB_49", "AM_BO_AB_94", "AM_BO_AB_94", "AM_BO_AB_94", "AN_JA_AN_91",
"AN_KL_GE_11", "AN_KL_WO_91", "AN_MA_DI_95", "AN_MA_DI_95", "AN_SE_RA_95",
"AN_SE_RA_95", "AN_SI_RA_97", "AN_SO_PU_94", "AN_SU_RA_91", "AR_BO_RA_92",
"AR_KA_VI_94", "AR_KA_VI_94", "AS_AR_SO_90", "AS_AR_SU_95", "AS_KU_SO_90",
"AS_MO_AS_97", "AW_SI_OJ_97", "AW_SI_OJ_97", "AY_CH_SU_97", "BH_BE_LD_84",
"BH_BE_LI_83", "BH_BE_LI_83", "BH_BE_LI_84", "BH_KO_SA_87", "BH_PE_AB_89",
"BH_YA_SA_87", "BI_CH_PR_94", "BI_CH_PR_94"), Start_T2 = structure(c(NA,
NA, NA, NA, 1579514871, 1576658745, NA, 1579098225, NA, NA, 1576663067,
1576844759, NA, 1577330639, NA, NA, 1576693930, NA, NA, NA, NA,
NA, NA, NA, NA, NA, NA, 1577718380, 1577718380, 1577454467, NA,
NA, 1576352237, NA, NA, NA, NA, 1576420656, 1576420656, NA, NA,
1578031772, 1576872938, NA, NA), class = c("POSIXct", "POSIXt"
), tzone = "UTC"), End_T2 = structure(c(NA, NA, NA, NA, 1579515709,
1576660469, NA, 1579098989, NA, NA, 1576693776, 1576845312, NA,
1577331721, NA, NA, 1576694799, NA, NA, NA, NA, NA, NA, NA, NA,
NA, NA, 1577719049, 1577719049, 1577455167, NA, NA, 1576352397,
NA, NA, NA, NA, 1576421607, 1576421607, NA, NA, 1578032408, 1576873875,
NA, NA), class = c("POSIXct", "POSIXt"), tzone = "UTC")), row.names = c(NA,
45L), class = "data.frame")
And this is the reference data frame (df2):
structure(list(Goal_ID = c("AJ_BO_RA_92", "AL_EN_RA_95", "AM_BO_AB_49",
"AS_KU_SO_90", "BH_BE_LI_84", "BH_YA_SA_87", "BI_CH_PR_94", "BI_CH_PR_94"
), T2 = c("AJ_BO_RA_92", "AL_MA_RA_95", "AM_BO_AB_94", "AS_AR_SO_90",
"BH_BE_LI_83", "BH_YA_SA_87", "BI_NA_PR_94", "BI_NA_PR_94"),
T3 = c("AR_BO_RA_92", "AL_MA_RA_95", "AM_BO_AB_94", NA, "BH_BE_LI_83",
NA, "BI_CH_PR_94", "BI_CH_PR_94"), T4 = c("AJ_BO_RA_92",
"AL_MA_RA_95", "AM_BO_AB_94", NA, "BH_BE_LI_83", "BH_KO_SA_87",
"BI_CH_PR_94", "BI_CH_PR_94")), row.names = c(NA, -8L), class = c("tbl_df",
"tbl", "data.frame"))
For example, in my df1, I want
"AR_BO_RA_92" to be replaced by "AJ_BO_RA_92";
"AL_MA_RA_95" to be replaced by "AL_EN_RA_95";
"AM_BO_AB_94" to be replaced by "AM_BO_AB_49"
and so on...
I thought about using string_replace and I started with this:
df1$Partcipant_ID <- str_replace(df1$Partcipant_ID, "AR_BO_RA_92", "AJ_BO_RA_92")
But that is of course very unefficient because I have so many replacements and it would be nice to make use of my reference data frame. I just cannot figure it out myself.
I hope this is understandable. Please ask if you need additional information.
Thank you so much already!
You can use match to find where the string is located and excange those which have been found and are not NA like:
i <- match(df1$Partcipant_ID, unlist(df2[-1])) %% nrow(df2)
j <- !is.na(i)
df1$Partcipant_ID[j] <- df2$Goal_ID[i[j]]
df1$Partcipant_ID
# [1] "AA_SH_RA_91" "AA_SH_RA_91" "AB_BA_PR_93" "AB_BH_VI_90" "AB_BH_VI_90"
# [6] "AB_SA_TA_91" "AJ_BO_RA_92" "AJ_BO_RA_92" "AK_SH_HA_91" "AL_EN_RA_95"
#[11] "AL_MA_RA_95" "AL_SH_BA_99" "AM_BO_AB_49" "AM_BO_AB_94" "AM_BO_AB_94"
#[16] "AM_BO_AB_94" "AN_JA_AN_91" "AN_KL_GE_11" "AN_KL_WO_91" "AN_MA_DI_95"
#[21] "AN_MA_DI_95" "AN_SE_RA_95" "AN_SE_RA_95" "AN_SI_RA_97" "AN_SO_PU_94"
#[26] "AN_SU_RA_91" "AR_BO_RA_92" "AR_KA_VI_94" "AR_KA_VI_94" "AS_AR_SO_90"
#[31] "AS_AR_SU_95" "AS_KU_SO_90" "AS_MO_AS_97" "AW_SI_OJ_97" "AW_SI_OJ_97"
#[36] "AY_CH_SU_97" "BH_BE_LD_84" "BH_BE_LI_83" "BH_BE_LI_83" "BH_BE_LI_84"
#[41] "BH_KO_SA_87" "BH_PE_AB_89" "BH_YA_SA_87" "BI_CH_PR_94" "BI_CH_PR_94"
I think this might work. Create a true look up table with a column of correct and incorrect codes. I.e. stack the columns, then join the subsequent df3 to df1 and use coalesce to create a new part_id. You spelt participant wrong, which made me feel more human I always do that.
library(dplyr)
df3 <- df2[1:2] %>%
bind_rows(df2[c(1,3)] %>% rename(T2 = T3),
df2[c(1,4)] %>% rename(T2 = T4)) %>%
distinct()
df1 %>%
left_join(df3, by = c("Partcipant_ID" = "T2")) %>%
mutate(Goal_ID = coalesce(Goal_ID, Partcipant_ID)) %>%
select(Goal_ID, Partcipant_ID, Start_T2, End_T2)

linear regression model with dplyr on sepcified columns by name

I have the following data frame, each row containing four dates ("y") and four measurements ("x"):
df = structure(list(x1 = c(69.772808673525, NA, 53.13125414839,
17.3033274666411,
NA, 38.6120670385487, 57.7229000792707, 40.7654208618078, 38.9010405201831,
65.7108936694177), y1 = c(0.765671296296296, NA, 1.37539351851852,
0.550277777777778, NA, 0.83037037037037, 0.0254398148148148,
0.380671296296296, 1.368125, 2.5250462962963), x2 = c(81.3285388496182,
NA, NA, 44.369872853302, NA, 61.0746827226573, 66.3965114460601,
41.4256874481852, 49.5461413070349, 47.0936997726146), y2 =
c(6.58287037037037,
NA, NA, 9.09377314814815, NA, 7.00127314814815, 6.46597222222222,
6.2462962962963, 6.76976851851852, 8.12449074074074), x3 = c(NA,
60.4976916064608, NA, 45.3575294731303, 45.159758146854, 71.8459173097114,
NA, 37.9485456227131, 44.6307631013742, 52.4523342186143), y3 = c(NA,
12.0026157407407, NA, 13.5601157407407, 16.1213657407407, 15.6431018518519,
NA, 15.8986805555556, 13.1395138888889, 17.9432638888889), x4 = c(NA,
NA, NA, 57.3383407228293, NA, 59.3921356160536, 67.4231673171527,
31.853845252547, NA, NA), y4 = c(NA, NA, NA, 18.258125, NA,
19.6074768518519,
20.9696527777778, 23.7176851851852, NA, NA)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -10L))
I would like to create an additional column containing the slope of all the y's versus all the x's, for each row (each row is a patient with these 4 measurements).
Here is what I have so far:
df <- df %>% mutate(Slope = lm(vars(starts_with("y") ~
vars(starts_with("x"), data = .)
I am getting an error:
invalid type (list) for variable 'vars(starts_with("y"))'...
What am I doing wrong, and how can I calculate the rowwise slope?
You are using a tidyverse syntax but your data is not tidy...
Maybe you should rearrange your data.frame and rethink the way you store your data.
Here is how to do it in a quick and dirty way (at least if I understood your explanations correctly):
df <- merge(reshape(df[,(1:4)*2-1], dir="long", varying = list(1:4), v.names = "x", idvar = "patient"),
reshape(df[,(1:4)*2], dir="long", varying = list(1:4), v.names = "y", idvar = "patient"))
df$patient <- factor(df$patient)
Then you could loop over the patients, perform a linear regression and get the slopes as a vector:
sapply(levels(df$patient), function(pat) {
coef(lm(y~x,df[df$patient==pat,],na.action = "na.omit"))[2]
})

R: Pearson correlation in a loop, prevent stopping when an error occurs and output NAs

I want to run Pearson correlations of each row of a matrix (dat) vs a vector (v1), as part of a loop, and output the correlation coefficients and associated p-values in a table. Here is an example for random data (data pasted at the end):
result_table <- data.frame(matrix(ncol = 2, nrow = nrow(dat)))
colnames(result_table) <- c("correlation_coefficient", "pvalue")
for(i in 1:nrow(dat)){
print(i)
corr <- cor.test(as.numeric(dat[i,]), v1, na.action = "na.omit")
result_table[i,1] <- corr$estimate
result_table[i,2] <- corr$p.value
}
When cor.test() removes missing data, sometimes there are not enough observations remaining and the loop stops with an error (for example at row 11). I would like the loop to continue running, just leaving the values in the result table as NAs. I think the result table should then look like this:
> result_table
correlation_coefficient pvalue
1 0.68422642 0.04206591
2 -0.15895586 0.70694013
3 -0.37005028 0.53982309
4 0.08448970 0.89255250
5 0.86860091 0.05603661
6 0.19544883 0.75274040
7 -0.94695380 0.01454887
8 -0.03817885 0.94275955
9 -0.15214122 0.77354897
10 -0.22997890 0.70978386
11 NA NA
12 NA NA
13 -0.27769887 0.59415930
14 -0.09768153 0.81800885
15 -0.20986632 0.61790214
16 -0.40474976 0.31990456
17 -0.00605937 0.98863896
18 0.02176976 0.95919460
19 -0.14755097 0.72733118
20 -0.25830856 0.50216600
I would also like the errors to keep being printed
Here is the data:
> dput(v1)
c(-0.840396, 0.4746047, -1.101857, 0.5164767, 1.2203134, -0.9758888,
-0.3657913, -0.6272523, -0.5853803, 1.7367901)
> dput(dat)
structure(list(s1 = c(-0.52411895, 0.14709633, 0.05433954, 0.7504406,
-0.59971988, -0.59679685, -0.12571854, 0.73289705, -0.71668771,
-0.04813957, -0.67849896, -0.11947141, -0.26371884, -1.34137162,
2.60928064, -1.23397547, 0.51811222, -4.10759883, -0.70127093,
7.51914575), s2 = c(0.21446623, -0.27281487, NA, NA, NA, NA,
NA, NA, -0.62468391, NA, NA, NA, -3.84387999, 0.64010069, NA,
NA, NA, NA, NA, NA), s3 = c(0.3461212, 0.279062, NA, NA, NA,
-0.4737744, 0.6313365, -2.8472641, 1.2647846, 2.2524449, -0.7913039,
-0.752590307, -3.535815266, 1.692385187, 3.55789764, -1.694910854,
-3.624517121, -4.963855198, 2.395998161, 5.35680032), s4 = c(0.3579742,
0.3522745, -1.1720907, 0.4223402, 0.146605, -0.3175295, -1.383926807,
-0.688551166, NA, NA, NA, NA, NA, 0.703612974, 1.79890268, -2.625404608,
-3.235884921, -2.845474098, 0.058650461, 1.83900702), s5 = c(1.698104376,
NA, NA, NA, NA, NA, -1.488000007, -0.739488766, 0.276012387,
0.49344994, NA, NA, -1.417434166, -0.644962513, 0.04010434, -3.388182254,
2.900252493, -1.493417096, -2.852256003, -0.98871696), s6 = c(0.3419271,
0.2482013, -1.2230283, 0.270752, -0.6653978, -1.1357202, NA,
NA, NA, NA, NA, NA, NA, NA, -1.0288213, -1.17817328, 6.1682455,
1.02759131, -3.80372867, -2.6249692), s7 = c(0.3957243, 0.8758406,
NA, NA, NA, NA, NA, 0.60196247, -1.28631859, -0.5754757, NA,
NA, NA, NA, NA, NA, NA, NA, NA, -2.6303001), s8 = c(-0.26409595,
1.2643281, 0.05687957, -0.09459169, -0.7875279, NA, NA, NA, NA,
NA, NA, NA, 2.42442997, -0.00445559, -1.0341522, 2.47315322,
0.1190265, 5.82533417, 0.82239131, -0.8279679), s9 = c(0.237123,
-0.5004619, 0.4447322, -0.2155249, -0.2331443, 1.3438071, -0.3817672,
1.9228182, 0.305661, -0.01348, NA, NA, 3.4009042, 0.8268469,
0.2061843, -1.1228663, -0.1443778, 4.8789902, 1.3480328, 0.4258486
), s10 = c(NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,
0.5211859, 0.2196643, -1.2333367, 0.1186947, 1.478086, 0.5211859,
0.2196643)), .Names = c("s1", "s2", "s3", "s4", "s5", "s6", "s7",
"s8", "s9", "s10"), class = "data.frame", row.names = c(NA, -20L
))
A solution with tryCatch could be
for(i in 1:nrow(dat)){
print(i)
corr <- tryCatch(cor.test(as.numeric(dat[i,]), v1, na.action = "na.omit"), error = function(e) return(NA))
if(length(corr) == 1){
result_table[i,1] <- NA
result_table[i,2] <- NA
}else{
result_table[i,1] <- corr$estimate
result_table[i,2] <- corr$p.value
}
}
Here is a solution with tryCatch():
Replacing the for loop with:
for(i in 1:nrow(dat)){
tryCatch({
print(i)
corr <- cor.test(as.numeric(dat[i,]), v1, na.action = "na.omit") # Correlation miRNA activity vs CNVs for that gene
result_table[i,1] <- corr$estimate
result_table[i,2] <- corr$p.value
}, error=function(e){cat("ERROR :",conditionMessage(e), "\n")})
}

custom rmeta - forest plot generation does not work: " 'x' and 'units' must have length > 0"

I tried to generate a "forest plot" without summary estimates using the rmeta package. However, using ?forestplot and then starting from the description or the example does not help, I am always getting the same error. I would assume that it is a simple one that has to do with the matrix/vector lengths somewhat not lining up but I kept changing and adjusting and still cannot find the error...
Here is the example code:
tabletext<-cbind(c(NA, NA, NA, NA, NA, NA),
c(NA, NA, NA, NA, NA, NA),
c("variable1","subgroup","2nd", "3rd", "4th", "5th"),
c(NA,"mean","1.8683639", "2.5717301", "4.4966049, 9.0008054")
)
tabletext
png("forestplot.png")
forestplot(tabletext, mean = c(NA, NA, 1.8683639, 2.5717301, 4.4966049, 9.0008054), lower = c(NA, NA, 1.4604643, 2.0163468, 3.5197956, 6.9469213), upper = c(NA, NA, 2.3955105, 3.2897459, 5.7672966, 11.7288609),
is.summary = c(rep(FALSE, 6)), zero = 1, xlog=FALSE, boxsize=0.75, xticks = NULL, clip = c(0.9, 12))
dev.off()
Error message:
clip = c(0.9, 12))
Error in unit(rep(1, sum(widthcolumn)), "grobwidth", labels[[1]][widthcolumn]) :
'x' and 'units' must have length > 0
dev.off()
Any help is very much appreciated!
This works with the forestplot-package although you need to remove the xticks=NULL:
tabletext<-cbind(c(NA, NA, NA, NA, NA, NA),
c(NA, NA, NA, NA, NA, NA),
c("variable1","subgroup","2nd", "3rd", "4th", "5th"),
c(NA,"mean","1.8683639", "2.5717301", "4.4966049, 9.0008054")
)
png("forestplot.png")
forestplot(tabletext,
mean = c(NA, NA, 1.8683639, 2.5717301, 4.4966049, 9.0008054),
lower = c(NA, NA, 1.4604643, 2.0163468, 3.5197956, 6.9469213),
upper = c(NA, NA, 2.3955105, 3.2897459, 5.7672966, 11.7288609),
is.summary = c(rep(FALSE, 6)), zero = 1,
xlog=FALSE, boxsize=0.75, clip = c(0.9, 12))
dev.off()
Gives (I recommend some polishing before submitting for publishing):

Resources