create sql expression in R for certain condition - r

I pull data from SQL Server to perform a regression analysis, and then write the regression results back to another SQL table.
library("RODBC")
library(sqldf)
dbHandle <- odbcDriverConnect("driver={SQL Server};server=MYSERVER;database=MYBASE;trusted_connection=true")
sql <-
"select
Dt
,CustomerName
,ItemRelation
,SaleCount
,DocumentNum
,DocumentYear
,IsPromo
from dbo.mytable"
df <- sqlQuery(dbHandle, sql)
After this query I must perform the regression analysis separately for each group:
# Fit a simple linear regression of SaleCount on IsPromo.
#
# df: a data frame that provides the SaleCount and IsPromo columns.
# Returns the fitted "lm" object.
my_lm <- function(df) {
  group_fit <- lm(SaleCount ~ IsPromo, data = df)
  group_fit
}
# Fit one model per (CustomerName, ItemRelation, DocumentNum, DocumentYear)
# group with my_lm(), then flatten the per-group tidy() tables into a single
# data frame. Only the grouping columns and the tidy() output columns remain.
# NOTE(review): tidy() is presumably broom::tidy -- confirm broom is loaded.
reg <- df %>%
  group_by(CustomerName, ItemRelation, DocumentNum, DocumentYear) %>%
  nest() %>%
  mutate(
    fit = map(data, my_lm),
    tidy = map(fit, tidy)
  ) %>%
  select(-fit, -data) %>%
  # Name the list-column explicitly: a bare unnest() is deprecated since
  # tidyr 1.0.0 and would emit a warning on every scheduled run.
  unnest(tidy)
View(reg)
#save to sql table
sqlSave(dbHandle, as.data.frame(reg), "dbo.mytableforecast", verbose = TRUE) # use "append = TRUE" to add rows to an existing table
odbcClose(dbHandle)
The question:
The script runs automatically, i.e. a scheduler task launches the script at a set time.
For example, today 100 observations were loaded,
covering 01.01.2017-10.04.2017.
The script performed the regression and returned the results to the SQL table.
Tomorrow 100 new observations will be loaded,
covering 11.04.2017-20.07.2017.
I.e. when the new data are loaded tomorrow and the script starts at 10 pm, it must work only with the data from 11.04.2017-20.07.2017, and not with the data from 01.01.2017-20.07.2017.
The situation is complicated by the fact that the column Dt is dropped after the regression, so the solution given to me here does not work
Automatic transfer data from the sql to R
because Dt is absent.
How can I add a condition to the scheduled query, i.e. select Dt ,CustomerName ,ItemRelation ,SaleCount ,DocumentNum ,DocumentYear ,IsPromo from dbo.mytable "where Dt > the last date when the script was launched"?
Is it possible to create this expression?
data example from sql
df=structure(list(Dt = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
8L, 9L, 9L, 10L, 10L, 11L, 11L, 12L, 12L, 13L, 13L, 14L, 14L,
15L, 15L, 16L, 16L, 16L, 16L, 17L, 17L, 17L, 17L, 18L, 18L, 18L,
18L, 19L), .Label = c("2017-10-12 00:00:00.000", "2017-10-13 00:00:00.000",
"2017-10-14 00:00:00.000", "2017-10-15 00:00:00.000", "2017-10-16 00:00:00.000",
"2017-10-17 00:00:00.000", "2017-10-18 00:00:00.000", "2017-10-19 00:00:00.000",
"2017-10-20 00:00:00.000", "2017-10-21 00:00:00.000", "2017-10-22 00:00:00.000",
"2017-10-23 00:00:00.000", "2017-10-24 00:00:00.000", "2017-10-25 00:00:00.000",
"2017-10-26 00:00:00.000", "2017-10-27 00:00:00.000", "2017-10-28 00:00:00.000",
"2017-10-29 00:00:00.000", "2017-10-30 00:00:00.000"), class = "factor"),
CustomerName = structure(c(1L, 11L, 12L, 13L, 14L, 15L, 16L,
17L, 18L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 11L, 12L,
13L, 14L, 15L, 16L, 17L, 18L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L), .Label = c("x1", "x10", "x11", "x12", "x13", "x14",
"x15", "x16", "x17", "x18", "x2", "x3", "x4", "x5", "x6",
"x7", "x8", "x9"), class = "factor"), ItemRelation = c(13322L,
13322L, 13322L, 13322L, 13322L, 13322L, 13322L, 11706L, 13322L,
11706L, 13322L, 11706L, 13322L, 11706L, 13322L, 11706L, 13322L,
11706L, 13322L, 11706L, 13322L, 11706L, 13322L, 11706L, 13163L,
13322L, 158010L, 11706L, 13163L, 13322L, 158010L, 11706L,
13163L, 13322L, 158010L, 11706L), SaleCount = c(10L, 3L,
1L, 0L, 9L, 5L, 5L, 11L, 7L, 0L, 5L, 11L, 1L, 0L, 0L, 19L,
10L, 0L, 1L, 12L, 1L, 11L, 6L, 0L, 167L, 7L, 0L, 16L, 165L,
1L, 0L, 0L, 29L, 0L, 0L, 11L), DocumentNum = c(36L, 36L,
36L, 36L, 36L, 36L, 36L, 51L, 36L, 51L, 36L, 51L, 36L, 51L,
36L, 51L, 36L, 51L, 36L, 51L, 36L, 51L, 36L, 51L, 131L, 36L,
89L, 51L, 131L, 36L, 89L, 51L, 131L, 36L, 89L, 51L), DocumentYear = c(2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L),
IsPromo = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), .Names = c("Dt", "CustomerName",
"ItemRelation", "SaleCount", "DocumentNum", "DocumentYear", "IsPromo"
), class = "data.frame", row.names = c(NA, -36L))

Consider saving the max DT (retrieved before regression that drops field) in a log file at the end of your scheduled script, then add a log read-in at beginning of script for the last logged date to include in WHERE clause:
# READ DATE FROM LOG FILE
log_dt <- readLines("/path/to/SQL_MaxDate.txt", warn=FALSE)
# QUERY WITH WHERE CLAUSE
sql <- paste0("SELECT Dt, CustomerName, ItemRelation, SaleCount,
DocumentNum, DocumentYear, IsPromo
FROM dbo.mytable WHERE Dt > '", log_dt, "'")
df <- sqlQuery(dbHandle, sql)
# RETRIEVE MAX DATE VALUE
max_DT <- as.character(max(df$Dt))
# ... regression
# WRITE DATE TO LOG FILE
cat(max_DT, file="/path/to/SQL_MaxDate.txt")
Better yet, use parameterization with RODBCext to avoid string concatenation and quoting:
library(RODBC)
library(RODBCext)
# READ DATE FROM LOG FILE
log_dt <- readLines("/path/to/SQL_MaxDate.txt", warn=FALSE)
dbHandle <- odbcDriverConnect(...)
# PREPARED STATEMENT WITH PLACEHOLDER
# Query text with a single "?" placeholder; the value is bound at execution
# time by sqlExecute(), so no quoting or string concatenation is needed.
sql <- "SELECT Dt, CustomerName, ItemRelation, SaleCount,
DocumentNum, DocumentYear, IsPromo
FROM dbo.mytable WHERE Dt > ?"
# EXECUTE QUERY BINDING PARAM VALUE
df <- sqlExecute(dbHandle, sql, log_dt, fetch=TRUE)
# RETRIEVE MAX DATE VALUE
max_DT <- as.character(max(df$Dt))
# ... regression
# WRITE DATE TO LOG FILE
cat(max_DT, file="/path/to/SQL_MaxDate.txt")

Related

ANOVA error: why is each row of output *not* identified by a unique combination of keys?

I have a two-way ANOVA test (w/repeated measures) that I'm using with four almost identical datasets:
> res.aov <- anova_test(
+ data = LST_Weather_dataset_N, dv = LST, wid = Month,
+ within = c(Buffer, TimePeriod),
+ effect.size = "ges",
+ detailed = TRUE,
+ )
Where:
LST = surface temperature deviation in C
Month = 1-12
Buffer = a value 100-1900 - one of 19 areas outward from the boundary of a solar power plant (each 100m wide)
TimePeriod = a factor with a value of 1 or 2 corresponding to pre-/post-construction of a solar power plant.
For one dataset I get the error:
Error: Each row of output must be identified by a unique combination of keys.
Keys are shared for 38 rows:
* 10, 11
* 217, 218
* 240, 241
* 263, 264
* 286, 287
* 309, 310
* 332, 333
...
As far as I can tell I have unique combinations.
dplyr::count(LST_Weather_dataset_N, LST, Month, Buffer, TimePeriod, sort = TRUE)
returns
LST Month Buffer TimePeriod n
1 -6.309045316 12 100 2 1
2 -5.655279925 9 1000 2 1
3 -5.224196295 12 200 2 1
4 -5.194473224 9 1100 2 1
5 -5.025429891 12 400 2 1
6 -4.987575966 9 700 2 1
7 -4.979453868 12 600 2 1
8 -4.825298768 12 300 2 1
9 -4.668994574 12 500 2 1
10 -4.652282192 12 700 2 1
...
'n' is always 1.
I can't work out why this is happening.
Extract of the data frame below:
> dput(LST_Weather_dataset_N[sample(1:nrow(LST_Weather_dataset_N), 50),])
structure(list(Buffer = c(1400L, 700L, 300L, 1400L, 100L, 200L,
1700L, 100L, 800L, 1900L, 1100L, 100L, 700L, 800L, 1400L, 400L,
1300L, 200L, 1200L, 500L, 1200L, 1300L, 400L, 1000L, 1300L, 1100L,
100L, 300L, 300L, 600L, 1100L, 1400L, 1500L, 1600L, 1700L, 1800L,
1700L, 1300L, 1200L, 300L, 1100L, 1900L, 1700L, 700L, 1400L,
1200L, 1600L, 1700L, 1900L, 1300L), Date = c("02/05/2014", "18/01/2017",
"19/06/2014", "25/12/2013", "15/09/2017", "08/04/2017", "22/08/2014",
"21/07/2014", "13/07/2017", "25/12/2013", "22/10/2013", "02/05/2014",
"07/03/2017", "15/03/2014", "13/07/2017", "19/06/2014", "25/12/2013",
"17/10/2017", "16/04/2014", "06/10/2013", "15/09/2017", "18/01/2017",
"10/01/2014", "17/12/2016", "13/07/2017", "19/06/2014", "07/03/2017",
"15/03/2014", "11/02/2014", "22/10/2013", "06/10/2013", "15/09/2017",
"16/04/2014", "18/01/2017", "15/03/2014", "21/07/2014", "17/10/2017",
"15/09/2017", "10/01/2014", "23/09/2014", "16/04/2014", "22/10/2013",
"11/06/2017", "26/05/2017", "19/06/2014", "14/08/2017", "11/02/2014",
"26/02/2017", "26/02/2017", "11/02/2014"), LST = c(1.255502397,
4.33385966, 3.327025603, -0.388631166, -0.865430798, 4.386292648,
-0.243018665, 3.276865987, 0.957036835, -0.065821795, 0.69731779,
4.846851651, -1.437700684, 1.003808572, 0.572460421, 2.995902374,
-0.334633662, -1.231447567, 0.644520741, 0.808262029, -3.392959991,
2.324569449, 2.346707612, -3.124354627, 0.58719862, 1.904859254,
1.701580958, 2.792443253, 1.638270039, 1.460743317, 0.699767335,
-3.015643366, 0.930527864, 1.309519336, 0.477789664, 0.147584938,
-0.498188865, -3.506795723, -1.007487965, 1.149604087, 1.192366386,
0.197471474, 0.999391224, -0.190613618, 1.27324015, 2.686622796,
0.573109026, 0.97847983, 0.395005095, -0.40855426), Month = c(5L,
1L, 6L, 12L, 9L, 4L, 8L, 7L, 7L, 12L, 10L, 5L, 3L, 3L, 7L, 6L,
12L, 10L, 4L, 10L, 9L, 1L, 1L, 12L, 7L, 6L, 3L, 3L, 2L, 10L,
10L, 9L, 4L, 1L, 3L, 7L, 10L, 9L, 1L, 9L, 4L, 10L, 6L, 5L, 6L,
8L, 2L, 2L, 2L, 2L), Year = c(2014L, 2017L, 2014L, 2013L, 2017L,
2017L, 2014L, 2014L, 2017L, 2013L, 2013L, 2014L, 2017L, 2014L,
2017L, 2014L, 2013L, 2017L, 2014L, 2013L, 2017L, 2017L, 2014L,
2016L, 2017L, 2014L, 2017L, 2014L, 2014L, 2013L, 2013L, 2017L,
2014L, 2017L, 2014L, 2014L, 2017L, 2017L, 2014L, 2014L, 2014L,
2013L, 2017L, 2017L, 2014L, 2017L, 2014L, 2017L, 2017L, 2014L
), JulianDay = c(122L, 18L, 170L, 359L, 258L, 98L, 234L, 202L,
194L, 359L, 295L, 122L, 66L, 74L, 194L, 170L, 359L, 290L, 106L,
279L, 258L, 18L, 10L, 352L, 194L, 170L, 66L, 74L, 42L, 295L,
279L, 258L, 106L, 18L, 74L, 202L, 290L, 258L, 10L, 266L, 106L,
295L, 162L, 146L, 170L, 226L, 42L, 57L, 57L, 42L), TimePeriod = c(1L,
2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L,
2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L,
2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L,
1L), Temperature = c(28L, 9L, 31L, 12L, 27L, 21L, 29L, 36L, 38L,
12L, 23L, 28L, 12L, 21L, 38L, 31L, 12L, 23L, 25L, 22L, 27L, 9L,
11L, 7L, 38L, 31L, 12L, 21L, 14L, 23L, 22L, 27L, 25L, 9L, 21L,
36L, 23L, 27L, 11L, 31L, 25L, 23L, 29L, 27L, 31L, 34L, 14L, 16L,
16L, 14L), Humidity = c(6L, 34L, 7L, 31L, 29L, 22L, 34L, 15L,
19L, 31L, 16L, 6L, 14L, 14L, 19L, 7L, 31L, 12L, 9L, 12L, 29L,
34L, 33L, 18L, 19L, 7L, 14L, 14L, 31L, 16L, 12L, 29L, 9L, 34L,
14L, 15L, 12L, 29L, 33L, 18L, 9L, 16L, 8L, 13L, 7L, 13L, 31L,
31L, 31L, 31L), Wind_speed = c(6L, 0L, 6L, 7L, 13L, 33L, 6L,
20L, 9L, 7L, 0L, 6L, 0L, 6L, 9L, 6L, 7L, 6L, 0L, 7L, 13L, 0L,
0L, 35L, 9L, 6L, 0L, 6L, 6L, 0L, 7L, 13L, 0L, 0L, 6L, 20L, 6L,
13L, 0L, 0L, 0L, 0L, 24L, 11L, 6L, 24L, 6L, 26L, 26L, 6L), Wind_gust = c(0L,
0L, 0L, 0L, 0L, 54L, 0L, 46L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 48L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 46L, 0L, 0L, 0L, 0L, 0L, 0L, 48L, 0L, 0L, 39L,
0L, 41L, 41L, 0L), Wind_trend = c(1L, 0L, 1L, 1L, 2L, 2L, 0L,
1L, 2L, 1L, 0L, 1L, 0L, 1L, 2L, 1L, 1L, 0L, 0L, 2L, 2L, 0L, 1L,
1L, 2L, 1L, 0L, 1L, 1L, 0L, 2L, 2L, 0L, 0L, 1L, 1L, 0L, 2L, 1L,
1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), Wind_direction = c(0,
0, 0, 337.5, 360, 22.5, 0, 22.5, 0, 337.5, 0, 0, 0, 0, 0, 0,
337.5, 180, 0, 247.5, 360, 0, 0, 180, 0, 0, 0, 0, 337.5, 0, 247.5,
360, 0, 0, 0, 22.5, 180, 360, 0, 0, 0, 0, 360, 22.5, 0, 360,
337.5, 360, 360, 337.5), Pressure = c(940.2, 943.64, 937.69,
951.37, 932.69, 933.94, 937.07, 938.01, 937.69, 951.37, 939.72,
940.2, 948.33, 947.71, 937.69, 937.69, 951.37, 943.32, 932.69,
944.71, 932.69, 943.64, 942.31, 943.01, 937.69, 937.69, 948.33,
947.71, 941.94, 939.72, 944.71, 932.69, 932.69, 943.64, 947.71,
938.01, 943.32, 932.69, 942.31, 938.94, 932.69, 939.72, 928.31,
931.12, 937.69, 932.37, 941.94, 936.13, 936.13, 941.94), Pressure_trend = c(1L,
2L, 0L, 2L, 0L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 0L, 2L,
1L, 2L, 1L, 0L, 2L, 2L, 2L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 2L,
2L, 1L, 1L, 1L, 0L, 2L, 1L, 2L, 1L, 0L, 0L, 0L, 1L, 1L, 2L, 2L,
1L)), row.names = c(179L, 14L, 195L, 426L, 306L, 118L, 299L,
229L, 244L, 436L, 374L, 153L, 90L, 91L, 256L, 197L, 424L, 348L,
137L, 355L, 328L, 26L, 7L, 419L, 254L, 211L, 78L, 81L, 43L, 359L,
373L, 332L, 143L, 32L, 109L, 263L, 393L, 330L, 23L, 309L, 135L,
398L, 224L, 166L, 217L, 290L, 69L, 72L, 76L, 63L), class = "data.frame")
Well, this is a bit embarrassing.
The error arose because there were not, in fact, paired months in the data. Rather than there being 38 data points (19x2) for each month, an error in determining the month value meant that one month had 57 data points (19x3). Correcting this, and checking that each month had the same number of paired data points for the ANOVA, allowed the test to run successfully.
> res.aov <- anova_test(
+ data = LST_Weather_dataset_N, dv = LST, wid = Month,
+ within = c(Buffer, TimePeriod),
+ effect.size = "ges",
+ detailed = TRUE,
+ )
> get_anova_table(res.aov, correction = "auto")
ANOVA Table (type III tests)
Effect DFn DFd SSn SSd F p p<.05 ges
1 (Intercept) 1 11 600.135 974.584 6.774 2.50e-02 * 0.189
2 Buffer 18 198 332.217 331.750 11.015 2.05e-21 * 0.115
3 TimePeriod 1 11 29.561 977.945 0.333 5.76e-01 0.011
4 Buffer:TimePeriod 18 198 13.055 283.797 0.506 9.53e-01 0.005
I still don't understand how the error message was telling me this, though.

Percentile in a data frame using two columns

Perhaps it's an easy problem but I'm stuck.
My data frame (which comes from a yearly survey) contains length data of several species by year and by haul. I want to obtain, for each year, the 95th percentile for each species. A sample of my data frame:
structure(list(year = c(2015L, 2015L, 2015L, 2015L, 2014L, 2016L,
2015L, 2016L, 2014L, 2016L, 2015L, 2015L, 2016L, 2016L, 2014L, 2014L,
2014L, 2015L, 2016L, 2016L), cod_haul = structure(c(72L, 51L, 77L,
43L, 20L, 92L, 75L, 93L, 9L, 103L, 65L, 63L, 85L, 102L, 27L, 24L,
14L, 55L, 114L, 105L), .Label = c("N14_02", "N14_03", "N14_04",
"N14_06", "N14_07", "N14_08", "N14_10", "N14_13", "N14_16", "N14_17",
"N14_19", "N14_21", "N14_24", "N14_25", "N14_26", "N14_27", "N14_28",
"N14_29", "N14_30", "N14_32", "N14_33", "N14_35", "N14_37", "N14_39",
"N14_40", "N14_41", "N14_42", "N14_44", "N14_51", "N14_54", "N14_55",
"N14_56", "N14_57", "N14_58", "N14_61", "N14_62", "N14_64", "N14_66",
"N14_67", "N15_01", "N15_03", "N15_07", "N15_11", "N15_12", "N15_14",
"N15_16", "N15_18", "N15_19", "N15_20", "N15_22", "N15_23", "N15_24",
"N15_25", "N15_26", "N15_27", "N15_28", "N15_29", "N15_30", "N15_31",
"N15_32", "N15_36", "N15_37", "N15_39", "N15_41", "N15_44", "N15_46",
"N15_47", "N15_48", "N15_52", "N15_55", "N15_56", "N15_58", "N15_59",
"N15_60", "N15_62", "N15_63", "N15_64", "N15_66", "N15_67", "N16_04",
"N16_06", "N16_07", "N16_08", "N16_11", "N16_12", "N16_13", "N16_15",
"N16_17", "N16_18", "N16_20", "N16_22", "N16_23", "N16_25", "N16_28",
"N16_29", "N16_30", "N16_31", "N16_32", "N16_33", "N16_34", "N16_35",
"N16_37", "N16_40", "N16_41", "N16_45", "N16_46", "N16_47", "N16_48",
"N16_49", "N16_50", "N16_51", "N16_52", "N16_53", "N16_54", "N16_56",
"N16_58", "N16_60", "N16_61", "N16_62", "N16_63", "N16_64","N16_66"),
class = "factor"), haul = c(58L, 23L, 64L, 11L, 32L, 23L, 62L, 25L,
16L, 40L, 44L, 39L, 12L, 37L, 42L, 39L, 25L, 27L, 54L, 45L), name =
structure(c(2L, 23L, 11L, 2L, 19L, 15L, 18L, 16L, 3L, 21L, 16L, 21L,
20L, 19L, 3L, 18L, 16L, 11L, 7L, 13L), .Label = c("Argentina
sphyraena", "Arnoglossus laterna", "Blennius ocellaris", "Boops
boops", "Callionymus lyra", "Callionymus maculatus", "Capros aper",
"Cepola macrophthalma", "Chelidonichthys cuculus", "Chelidonichthys
lucerna", "Conger conger", "Eutrigla gurnardus", "Gadiculus
argenteus", "Galeus melastomus", "Helicolenus dactylopterus",
"Lepidorhombus boscii", "Lepidorhombus whiffiagonis", "Merluccius
merluccius", "Microchirus variegatus", "Micromesistius poutassou",
"Phycis blennoides", "Raja clavata", "Scyliorhinus canicula",
"Solea solea", "Trachurus trachurus", "Trisopterus luscus"), class
= "factor"), length = c(9L, 18L, 50L, 12L, 14L, 12L, 31L, 19L, 15L,
16L, 26L, 48L, 23L, 10L, 16L, 24L, 12L, 46L, 75L, 13L), number =
c(5L, 4L, 1L, 2L, 29L, 5L, 2L, 14L, 1L, 1L, 4L, 1L, 29L, 21L, 2L,
1L, 2L, 1L, 2L, 14L)), row.names = c(NA, 20L), class =
"data.frame")
I haven't been able to find how to solve it even though I have tried several approaches, but none worked.
Any suggestions or advice is much appreciated.
Thanks!
PS: Although it isn't absolutely necessary, it would be great if the percentile could be added to the data frame as a new column.
# Per-year 95th percentile via quantile().
# NOTE(review): the sample data in the question has no `species` column; the
# measurement column is `length` and the species column is `name` -- adjust
# the column names to the real data before running.
df %>%
  group_by(year) %>%
  summarize(species.95 = quantile(species, 0.95))
I cannot download your dataframe but you can use the quantile function to find the 95% for each species.
if I get you right
library(tidyverse)
# Add a q95 column holding the 95th percentile of `length` within each
# year/species (`name`) group; every original row is kept.
df %>%
  group_by(year, name) %>%
  mutate(q95 = quantile(length, probs = 0.95))
or
library(data.table)
setDT(df)
df[, q95 := quantile(length, probs = 0.95), by = list(year, name)][order(name, year)]

Conducting regression analysis using R via SQL Server 2017

I want to perform a regression analysis using R code via SQL Server 2017 (R is integrated there).
Here is the native R code working with the csv
The main point of the code is that we perform the regression separately for each group defined by [CustomerName]+[ItemRelation]+[DocumentNum]+[DocumentYear]
df=read.csv("C:/Users/synthex/Desktop/re.csv", sep=";",dec=",")
#load needed library
library(tidyverse)
library(broom)
# Order the dataset: sort by column 5 first, then re-sort by column 6.
# order() leaves ties in their existing order, so the final result is sorted
# by column 6 with column 5 as the tie-breaker.
df <- df[order(df[, 5]), ]
df <- df[order(df[, 6]), ]
# Delete hyphens from the customer names.
# Write back to CustomerName itself: `df$Customer <- ...` would read
# CustomerName through `$` partial matching but store the cleaned values in a
# NEW column `Customer`, leaving the column used for grouping unchanged.
# NOTE(review): assumes the CSV has no separate `Customer` column -- confirm.
df$CustomerName <- gsub("\\-", "", df$CustomerName)
#create lm function for separately by group regression
# Regress SaleCount on IsPromo for the rows of a single group.
#
# df: a data frame that provides the SaleCount and IsPromo columns.
# Returns the fitted "lm" object.
my_lm <- function(df) {
  group_fit <- lm(SaleCount ~ IsPromo, data = df)
  group_fit
}
# Fit my_lm() once per (CustomerName, ItemRelation, DocumentNum, DocumentYear)
# group and stack the broom::tidy() coefficient tables into one data frame.
# Only the grouping columns and the tidy() output columns remain.
reg <- df %>%
  group_by(CustomerName, ItemRelation, DocumentNum, DocumentYear) %>%
  nest() %>%
  mutate(
    fit = map(data, my_lm),
    tidy = map(fit, tidy)
  ) %>%
  select(-fit, -data) %>%
  # Name the list-column explicitly: a bare unnest() is deprecated since
  # tidyr 1.0.0 and emits a warning.
  unnest(tidy)
w=aggregate(df$action, by=list(CustomerName=df$CustomerName,ItemRelation=df$ItemRelation, DocumentNum=df$DocumentNum, DocumentYear=df$DocumentYear), FUN=sum)
View(w)
# multiply each group by the number of days of the action
EA<-data.frame(reg$CustomerName,reg$ItemRelation,reg$DocumentNum,reg$DocumentYear, reg$estimate*w$x)
#del intercepts
toDelete <- seq(2, nrow(EA), 2)
newdat=EA[ toDelete ,]
View(newdat)
The finished result: this code runs in SSMS
So what I did:
-- Run an R script inside SQL Server. The script only copies InputDataSet
-- (the rows returned by @input_data_1) to OutputDataSet, so the call echoes
-- the query result back with the column types declared in WITH RESULT SETS.
-- Named parameters must be prefixed with "@"; "#" is not valid here.
EXECUTE sp_execute_external_script
@language = N'R'
, @script = N' OutputDataSet <- InputDataSet;'
, @input_data_1 = N' SELECT [CustomerName]
,[ItemRelation]
,[SaleCount]
,[DocumentNum]
,[DocumentYear]
,[IsPromo]
FROM [Action].[dbo].[promo_data];'
WITH RESULT SETS (([CustomerName] nvarchar(max) NOT NULL, [ItemRelation] int NOT NULL,
[SaleCount] int NOT NULL,[DocumentNum] int NOT NULL,
[DocumentYear] int NOT NULL, [IsPromo] int NOT NULL));
df=as.data.frame(InputDataSet)
Message 102, level 15, state 1, line 17
Incorrect syntax near the "=" construct.
So, how can I perform the regression analysis in SQL separately by groups?
Note that all coefficients must be saved, because when new data arrive in SQL Server, the values should be calculated automatically from the equation of the fitted model for each group.
The above code simply estimates the impact of the action: the beta coefficient of each group is multiplied by the number of days of the action for that group.
If it is needed, here is a reproducible example:
df=structure(list(CustomerName = structure(c(1L, 2L, 3L, 3L, 1L,
2L, 3L, 3L, 4L, 4L, 4L, 1L, 2L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 1L, 2L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
1L, 2L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 1L,
2L, 3L, 3L, 4L, 4L, 4L, 4L, 4L), .Label = c("Attacks of the vehicle",
"Auchan TS", "Tape of the vehicle", "X5 Retail Group"), class = "factor"),
ItemRelation = c(13322L, 13322L, 158121L, 158122L, 13322L,
13322L, 158121L, 158122L, 11592L, 13189L, 13191L, 13322L,
13322L, 158121L, 158122L, 11592L, 13189L, 13191L, 158121L,
158121L, 158122L, 158122L, 13322L, 13322L, 158121L, 158122L,
11592L, 13189L, 13191L, 157186L, 157192L, 158009L, 158010L,
158121L, 158121L, 158122L, 158122L, 13322L, 13322L, 158121L,
158122L, 11592L, 13189L, 13191L, 157186L, 157192L, 158009L,
158010L, 158121L, 158121L, 158122L, 158122L, 13322L, 13322L,
158121L, 158122L, 11514L, 11592L, 11623L, 13189L, 13191L),
SaleCount = c(10L, 35L, 340L, 260L, 3L, 31L, 420L, 380L,
45L, 135L, 852L, 1L, 34L, 360L, 140L, 14L, 62L, 501L, 0L,
560L, 640L, 0L, 0L, 16L, 0L, 0L, 15L, 66L, 542L, 49L, 228L,
3360L, 5720L, 980L, 0L, 0L, 1280L, 9L, 29L, 200L, 120L, 46L,
68L, 569L, 52L, 250L, 2360L, 3140L, 1640L, 0L, 0L, 1820L,
5L, 33L, 260L, 220L, 665L, 25L, -10L, 62L, 281L), DocumentNum = c(36L,
4L, 41L, 41L, 36L, 4L, 41L, 41L, 33L, 33L, 33L, 36L, 4L,
41L, 41L, 33L, 33L, 33L, 63L, 62L, 62L, 63L, 36L, 4L, 41L,
41L, 33L, 33L, 33L, 57L, 56L, 12L, 12L, 62L, 63L, 63L, 62L,
36L, 4L, 41L, 41L, 33L, 33L, 33L, 57L, 56L, 12L, 12L, 62L,
63L, 63L, 62L, 36L, 4L, 41L, 41L, 60L, 33L, 71L, 33L, 33L
), DocumentYear = c(2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L), IsPromo = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L)), .Names = c("CustomerName", "ItemRelation",
"SaleCount", "DocumentNum", "DocumentYear", "IsPromo"), class = "data.frame", row.names = c(NA,
-61L))

ggplot2 and first data point in a line

I am creating two plots using ggplot2 and then using grid.arrange to merge them together. I should say that both of the plots are also using facet_grid for a visual tweaking.
My problem is that the bottom plot, which is really a data table, ends up being "cut off" on BOTH the left and right sides because of the starting and ending positions of the facets. Is there a way for me to tweak this? I would like to adjust it so that the points do not get cut off.
Here is the data to reproduce it:
df <- structure(list(SurveyID = c(16L, 16L, 16L, 16L, 16L, 16L, 16L,
16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L,
16L, 16L, 16L, 16L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L,
26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L,
26L, 26L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L,
47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L, 47L,
56L, 56L, 56L, 56L, 56L, 56L, 56L, 56L, 56L, 56L, 56L, 56L, 56L,
56L, 56L, 56L, 56L, 56L, 56L, 56L, 56L, 56L, 56L, 56L, 76L, 76L,
76L, 76L, 76L, 76L, 76L, 76L, 76L, 76L, 76L, 76L, 76L, 76L, 76L,
76L, 76L, 76L, 76L, 76L, 76L, 76L, 76L, 76L, 83L, 83L, 83L, 83L
), MEPSID = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L
), ServiceID = structure(c(1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L), .Label = c("Army", "Navy", "Marines", "Air Force"
), class = "factor"), SurveyReturnedYear = c(2012L, 2012L, 2012L,
2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L, 2012L,
2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L,
2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L,
2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L,
2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L,
2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L, 2013L,
2013L, 2013L, 2013L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L,
2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L,
2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L,
2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L,
2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2014L,
2014L, 2014L, 2014L, 2014L, 2014L, 2014L, 2015L, 2015L, 2015L,
2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L, 2015L,
2015L, 2015L, 2015L, 2015L), SurveyReturnedMonth = c(10L, 10L,
10L, 10L, 11L, 11L, 11L, 11L, 12L, 12L, 12L, 12L, 1L, 1L, 1L,
1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L,
5L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L, 9L, 9L, 9L,
9L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L, 12L, 12L, 12L, 12L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L,
5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 8L,
9L, 9L, 9L, 9L, 10L, 10L, 10L, 10L, 11L, 11L, 11L, 11L, 12L,
12L, 12L, 12L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L, 3L, 3L, 3L,
4L, 4L, 4L, 4L), CompletedSurvey = c(23L, 19L, 38L, 16L, 11L,
16L, 38L, 19L, 6L, 14L, 41L, 10L, 6L, 32L, 46L, 18L, 12L, 30L,
35L, 18L, 11L, 32L, 23L, 19L, 8L, 24L, 46L, 19L, 18L, 28L, 30L,
19L, 12L, 27L, 32L, 15L, 20L, 31L, 34L, 26L, 30L, 25L, 26L, 17L,
41L, 16L, 24L, 12L, 43L, 23L, 22L, 15L, 29L, 21L, 22L, 18L, 38L,
10L, 20L, 13L, 46L, 19L, 19L, 9L, 32L, 10L, 17L, 27L, 31L, 21L,
17L, 18L, 30L, 18L, 19L, 20L, 22L, 23L, 17L, 17L, 34L, 21L, 16L,
4L, 34L, 29L, 20L, 18L, 25L, 21L, 24L, 19L, 15L, 16L, 18L, 13L,
28L, 19L, 24L, 0L, 23L, 13L, 13L, 2L, 34L, 13L, 22L, 4L, 17L,
26L, 5L, 17L, 27L, 18L, 30L, 0L, 30L, 11L, 34L, 0L, 27L, 9L,
34L, 0L), TotalSurvey = c(41L, 19L, 47L, 22L, 43L, 21L, 49L,
23L, 39L, 16L, 44L, 11L, 49L, 34L, 56L, 33L, 39L, 33L, 42L, 21L,
50L, 37L, 56L, 23L, 34L, 26L, 53L, 19L, 36L, 32L, 44L, 21L, 38L,
27L, 49L, 18L, 41L, 34L, 58L, 26L, 37L, 25L, 40L, 21L, 44L, 17L,
51L, 16L, 51L, 24L, 32L, 22L, 34L, 21L, 37L, 20L, 44L, 10L, 36L,
18L, 59L, 21L, 35L, 13L, 46L, 12L, 44L, 29L, 49L, 21L, 36L, 18L,
47L, 19L, 41L, 21L, 29L, 23L, 40L, 20L, 39L, 21L, 38L, 4L, 41L,
30L, 54L, 21L, 30L, 22L, 56L, 24L, 19L, 16L, 49L, 25L, 34L, 22L,
54L, 20L, 33L, 14L, 40L, 10L, 37L, 14L, 43L, 23L, 27L, 30L, 40L,
22L, 34L, 19L, 37L, 23L, 32L, 19L, 37L, 26L, 35L, 11L, 37L, 31L
), meps_labels = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), .Label = c("Albany", "Albuquerque", "Amarillo",
"Anchorage", "Atlanta", "Baltimore", "Beckley", "Boise", "Boston",
"Buffalo", "Butte", "Charlotte", "Chicago", "Cleveland", "Columbus",
"Dallas", "Denver", "Des Moines", "Detroit", "El Paso", "Fargo",
"Fort Dix", "Fort Jackson", "Fort Lee", "Harrisburg", "Honolulu",
"Houston", "Indianapolis", "Jackson", "Jacksonville", "Kansas City",
"Knoxville", "Lansing", "Little Rock", "Los Angeles", "Louisville",
"Memphis", "Miami", "Milwaukee", "Minneapolis", "Montgomery",
"Nashville", "New Orleans", "New York", "Oklahoma City", "Omaha",
"Phoenix", "Pittsburgh", "Portland, ME", "Portland, OR", "Raleigh",
"Sacramento", "Salt Lake City", "San Antonio", "San Diego", "San Jose",
"San Juan", "Seattle", "Shreveport", "Sioux Falls", "Spokane",
"Springfield", "St. Louis", "Syracuse", "Tampa"), class = "factor"),
RR = c(56, 100, 81, 73, 26, 76, 78, 83, 15, 88, 93, 91, 12,
94, 82, 55, 31, 91, 83, 86, 22, 86, 41, 83, 24, 92, 87, 100,
50, 88, 68, 90, 32, 100, 65, 83, 49, 91, 59, 100, 81, 100,
65, 81, 93, 94, 47, 75, 84, 96, 69, 68, 85, 100, 59, 90,
86, 100, 56, 72, 78, 90, 54, 69, 70, 83, 39, 93, 63, 100,
47, 100, 64, 95, 46, 95, 76, 100, 42, 85, 87, 100, 42, 100,
83, 97, 37, 86, 83, 95, 43, 79, 79, 100, 37, 52, 82, 86,
44, 0, 70, 93, 32, 20, 92, 93, 51, 17, 63, 87, 12, 77, 79,
95, 81, 0, 94, 58, 92, 0, 77, 82, 92, 0), Time = structure(c(15614,
15614, 15614, 15614, 15645, 15645, 15645, 15645, 15675, 15675,
15675, 15675, 15706, 15706, 15706, 15706, 15737, 15737, 15737,
15737, 15765, 15765, 15765, 15765, 15796, 15796, 15796, 15796,
15826, 15826, 15826, 15826, 15857, 15857, 15857, 15857, 15887,
15887, 15887, 15887, 15918, 15918, 15918, 15918, 15949, 15949,
15949, 15949, 15979, 15979, 15979, 15979, 16010, 16010, 16010,
16010, 16040, 16040, 16040, 16040, 16071, 16071, 16071, 16071,
16102, 16102, 16102, 16102, 16130, 16130, 16130, 16130, 16161,
16161, 16161, 16161, 16191, 16191, 16191, 16191, 16222, 16222,
16222, 16222, 16252, 16252, 16252, 16252, 16283, 16283, 16283,
16283, 16314, 16314, 16314, 16314, 16344, 16344, 16344, 16344,
16375, 16375, 16375, 16375, 16405, 16405, 16405, 16405, 16436,
16436, 16436, 16436, 16467, 16467, 16467, 16467, 16495, 16495,
16495, 16495, 16526, 16526, 16526, 16526), class = "Date"),
Year = c("2012", "2012", "2012", "2012", "2012", "2012",
"2012", "2012", "2012", "2012", "2012", "2012", "2013", "2013",
"2013", "2013", "2013", "2013", "2013", "2013", "2013", "2013",
"2013", "2013", "2013", "2013", "2013", "2013", "2013", "2013",
"2013", "2013", "2013", "2013", "2013", "2013", "2013", "2013",
"2013", "2013", "2013", "2013", "2013", "2013", "2013", "2013",
"2013", "2013", "2013", "2013", "2013", "2013", "2013", "2013",
"2013", "2013", "2013", "2013", "2013", "2013", "2014", "2014",
"2014", "2014", "2014", "2014", "2014", "2014", "2014", "2014",
"2014", "2014", "2014", "2014", "2014", "2014", "2014", "2014",
"2014", "2014", "2014", "2014", "2014", "2014", "2014", "2014",
"2014", "2014", "2014", "2014", "2014", "2014", "2014", "2014",
"2014", "2014", "2014", "2014", "2014", "2014", "2014", "2014",
"2014", "2014", "2014", "2014", "2014", "2014", "2015", "2015",
"2015", "2015", "2015", "2015", "2015", "2015", "2015", "2015",
"2015", "2015", "2015", "2015", "2015", "2015")), .Names = c("SurveyID",
"MEPSID", "ServiceID", "SurveyReturnedYear", "SurveyReturnedMonth",
"CompletedSurvey", "TotalSurvey", "meps_labels", "RR", "Time",
"Year"), row.names = c(1L, 2L, 3L, 4L, 261L, 262L, 263L, 264L,
521L, 522L, 523L, 524L, 781L, 782L, 783L, 784L, 1041L, 1042L,
1043L, 1044L, 1301L, 1302L, 1303L, 1304L, 1561L, 1562L, 1563L,
1564L, 1821L, 1822L, 1823L, 1824L, 2081L, 2082L, 2083L, 2084L,
2341L, 2342L, 2343L, 2344L, 2601L, 2602L, 2603L, 2604L, 2861L,
2862L, 2863L, 2864L, 3121L, 3122L, 3123L, 3124L, 3381L, 3382L,
3383L, 3384L, 3641L, 3642L, 3643L, 3644L, 3901L, 3902L, 3903L,
3904L, 4161L, 4162L, 4163L, 4164L, 4421L, 4422L, 4423L, 4424L,
4681L, 4682L, 4683L, 4684L, 4941L, 4942L, 4943L, 4944L, 5201L,
5202L, 5203L, 5204L, 5461L, 5462L, 5463L, 5464L, 5721L, 5722L,
5723L, 5724L, 5981L, 5982L, 5983L, 5984L, 6241L, 6242L, 6243L,
6244L, 6501L, 6502L, 6503L, 6504L, 6761L, 6762L, 6763L, 6764L,
7021L, 7022L, 7023L, 7024L, 7281L, 7282L, 7283L, 7284L, 7541L,
7542L, 7543L, 7544L, 7801L, 7802L, 7803L, 7804L), class = "data.frame")
And the code:
library(ggplot2)
library(grid)
library(scales)
library(gridExtra)
# Line chart of the response rate (RR) over Time for MEPSID 1, with one
# coloured line per ServiceID and one facet per Year.
p <- ggplot(
  data = df[df$MEPSID == 1, ],
  aes(x = Time, y = RR, colour = ServiceID, group = ServiceID,
      label = round(RR))
) +
  scale_y_continuous(breaks = seq(0, 100, 10)) +
  labs(y = "Response Rate") +
  # Zoom (without dropping data) to leave head-room above 100.
  coord_cartesian(ylim = c(0, 110)) +
  geom_line(size = .5) +
  geom_point() +
  scale_color_manual(values = c("green4", "blue4", "red4", "dodgerblue")) +
  ggtitle("Counts") +
  theme(
    plot.title = element_text(size = 18, face = "bold", vjust = 1),
    axis.title = element_text(size = 16),
    axis.text.x = element_text(size = 10, angle = 90),
    axis.line = element_line(colour = "black", size = .2),
    legend.background = element_rect(fill = "transparent"),
    # legend.position must be supplied only once: it is a named argument of
    # theme(), so passing it twice ("top" and "none") is an error. "top" is
    # kept because the legend text and margin are styled here, which only
    # makes sense for a visible legend -- NOTE(review): confirm intent.
    legend.position = "top",
    legend.title = element_blank(),
    legend.margin = unit(-0.6, "cm"),
    legend.text = element_text(size = 14),
    panel.grid.minor.x = element_blank(),
    panel.grid.major.x = element_blank(),
    panel.grid.minor.y = element_blank(),
    panel.background = element_blank(),
    panel.grid.major.y = element_line(colour = "gray", linetype = "solid",
                                      size = .2)
  ) +
  # One tick per month, labelled with the abbreviated month name.
  scale_x_date(labels = date_format("%b"), breaks = date_breaks("month")) +
  facet_grid(~Year, scales = "free", space = "free")
# Companion text strip for MEPSID 1: prints the rounded RR value at each
# (Time, ServiceID) position, faceted by Year, with all decoration removed.
p2 <- ggplot(
  df[df$MEPSID == 1, ],
  aes(
    x = Time,
    y = ServiceID,
    label = format(round(RR), nsmall = 0),
    colour = ServiceID
  )
) +
  geom_text(size = 3.5) +
  theme(
    panel.grid.major = element_blank(),
    legend.position = "none",
    panel.border = element_blank(),
    panel.background = element_blank(),
    axis.text.x = element_text(),
    axis.ticks = element_blank(),
    plot.margin = unit(c(-0.5, 1, 0, 0.5), "lines")
  ) +
  # No axis titles on either axis.
  labs(x = NULL, y = NULL) +
  # Monthly breaks with the tick labels suppressed.
  scale_x_date(
    labels = NULL,
    breaks = date_breaks("month"),
    expand = c(0.05, 0.05)
  ) +
  facet_grid(~Year, scales = "free", space = "free_x") +
  # Reverse the ServiceID level order on the y axis.
  scale_y_discrete(limits = rev(levels(df$ServiceID))) +
  scale_color_manual(values = c("green4", "blue4", "red4", "dodgerblue"))
# Stack the line chart above the text strip in two rows, with relative
# heights 5 and 1.
grid.arrange(
  arrangeGrob(
    p, p2,
    nrow = 2,
    heights = c(5, 1)
  )
)
You can use geom_blank to fine-tune facetted scales.
# Redraw both plots with an extra geom_blank() layer each. The layer draws
# nothing; its Time values only take part in training the x scale of the
# 2012 and 2015 facets.
grid.arrange(
  p + geom_blank(
    data = data.frame(
      Time = as.Date(c("2012-09-20", "2012-12-15",
                       "2014-12-20", "2015-04-10")),
      RR = 1:4,
      Year = c(2012, 2012, 2015, 2015)
    ),
    # Unset the aesthetics inherited from the plot that this data lacks.
    aes(colour = NULL, group = NULL, label = NULL)
  ),
  p2 + geom_blank(
    data = data.frame(
      Time = as.Date(c("2012-09-20", "2012-12-15",
                       "2014-12-20", "2015-04-10")),
      ServiceID = 1:4,
      Year = c(2012, 2012, 2015, 2015)
    ),
    aes(colour = NULL, group = NULL, label = NULL)
  ),
  nrow = 2,
  heights = c(5, 1)
)
Another option is to adjust the text using hjust as an aesthetic. First, add it to the data as its own column, which you then pass to the ggplot call:
library(data.table)
# Take the MEPSID 1 rows and convert the result to a data.table, so the
# `:=` / grouped syntax below can be used.
DX <- setDT(df[df$MEPSID==1,])
# Within each Year, add an hjust column: 0.1 for rows at the earliest Time,
# 0.8 for rows at the latest Time, and 0.4 for every other row.
DX[,hjust:=ifelse(Time==min(Time),0.1,ifelse(Time==max(Time),0.8,0.4)),Year] # creates the new column hjust
# Same text plot as before, except that hjust is now mapped from the new
# column inside aes().
p2<-ggplot(DX,
aes(x = Time, y = ServiceID, label=format(round(RR), nsmall=0),
colour = ServiceID,hjust=hjust)) +
## ... followed by the remaining layers of plot 2 (left out of this snippet)
add some explanation:
Here you are plotting a text using (Time versus ServiceID) by year.
Since we want to shift our text horizontally, we will do it according to the value of Time (x-coordinate). More precisely, we will just shift the leftmost points to the right and the rightmost points to the left. This is done by setting a different hjust value for each group of values (left vs right).
So for each year (each facet), I horizontally adjust the points corresponding to the min of Time (the extreme left points of the facet) and the max of Time (the extreme right points of the facet). There is no need to adjust the other points, even though I do it here.
# Add (by reference) an hjust column to DX, computed separately per Year.
DX[
  ,
  hjust := ifelse(
    Time == min(Time),
    0.1,                      # extreme left points of the facet
    ifelse(
      Time == max(Time),
      0.8,                    # extreme right points of the facet
      0.4                     # all other points
    )
  ),
  by = Year                   # one group per facet
]
You can do the transformation in base R using ave:
# Base-R equivalent: within each Year of xx, map the numeric Time values to
# 0.1 (group minimum), 0.8 (group maximum) or 0.4 (anything else).
ave(
  as.numeric(xx$Time),
  xx$Year,
  FUN = function(tm) {
    ifelse(tm == min(tm), 0.1, ifelse(tm == max(tm), 0.8, 0.4))
  }
)

inconsistent datetime difference output

I am simply trying to calculate difference between observations for each group.
Dataset:
structure(list(IDYEAR = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "A0712002", class = "factor"),
MONTH = c(12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L,
12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L, 12L), DAY = c(5L,
6L, 6L, 7L, 8L, 8L, 9L, 9L, 10L, 12L, 12L, 13L, 13L, 13L,
14L, 14L, 14L, 15L, 15L), YEAR = c(2002L, 2002L, 2002L, 2002L,
2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L, 2002L,
2002L, 2002L, 2002L, 2002L, 2002L, 2002L), HOUR = c(9L, 19L,
23L, 1L, 1L, 3L, 19L, 21L, 17L, 17L, 19L, 17L, 19L, 23L,
3L, 9L, 19L, 3L, 11L), MINUTE = c(43L, 43L, 43L, 42L, 42L,
42L, 42L, 43L, 42L, 42L, 42L, 42L, 42L, 42L, 42L, 42L, 42L,
42L, 42L), SECOND = c(24L, 13L, 13L, 41L, 54L, 54L, 54L,
12L, 54L, 54L, 48L, 43L, 59L, 55L, 43L, 44L, 54L, 43L, 55L
), DATETIME = structure(c(12L, 13L, 14L, 15L, 16L, 17L, 18L,
19L, 1L, 2L, 3L, 4L, 5L, 6L, 8L, 9L, 7L, 11L, 10L), .Label = c("12/10/2002 17:42",
"12/12/2002 17:42", "12/12/2002 19:42", "12/13/2002 17:42",
"12/13/2002 19:42", "12/13/2002 23:42", "12/14/2002 19:42",
"12/14/2002 3:42", "12/14/2002 9:42", "12/15/2002 11:42",
"12/15/2002 3:42", "12/5/2002 9:43", "12/6/2002 19:43", "12/6/2002 23:43",
"12/7/2002 1:42", "12/8/2002 1:42", "12/8/2002 3:42", "12/9/2002 19:42",
"12/9/2002 21:43"), class = "factor"), GRP1700 = c(873L,
873L, 874L, 875L, 875L, 876L, 876L, 876L, 876L, 876L, 877L,
877L, 877L, 877L, 877L, 878L, 878L, 878L, 879L), ID1700 = structure(c(1L,
1L, 2L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 6L,
6L, 6L, 7L), .Label = c("A0712002873", "A0712002874", "A0712002875",
"A0712002876", "A0712002877", "A0712002878", "A0712002879"
), class = "factor")), .Names = c("IDYEAR", "MONTH", "DAY",
"YEAR", "HOUR", "MINUTE", "SECOND", "DATETIME", "GRP1700", "ID1700"
), class = "data.frame", row.names = c(NA, -19L))
Code
# For each ID1700 group, compute the time elapsed (in hours) between
# consecutive observations and write the result to "test2.csv".
library(dplyr)  # provides %>%, arrange(), group_by(), mutate(), lag()

dfa1 <- read.csv("test.csv")
head(dfa1)
dput(dfa1)

# Parse the "month/day/year hour:minute" strings as GMT timestamps.
# as.character() guards against DATETIME having been read in as a factor.
dfa1[["TESTDATE"]] <- as.POSIXct(
  as.character(dfa1$DATETIME),
  format = "%m/%d/%Y %H:%M",
  tz = "GMT"
)
dfa1$ID1700 <- as.factor(dfa1$ID1700)

dfa1 <- dfa1 %>%
  arrange(IDYEAR, GRP1700, TESTDATE) %>%
  group_by(ID1700) %>%
  # `TESTDATE - lag(TESTDATE)` lets difftime pick its units separately for
  # every group (hours in one, days in another), and the combined column then
  # carries a single, misleading unit label. Fixing the units and storing a
  # plain number makes TIME1700 consistently "hours since the previous
  # observation in the group" (NA for the first row of each group).
  mutate(
    TIME1700 = as.numeric(
      difftime(TESTDATE, lag(TESTDATE), units = "hours")
    )
  ) %>%
  ungroup()

write.csv(dfa1, "test2.csv")
Output:
TESTDATE TIME1700
1 2002-12-05 09:43:00 NA days
2 2002-12-06 19:43:00 1.416667 days
3 2002-12-06 23:43:00 NA days
4 2002-12-07 01:42:00 NA days
5 2002-12-08 01:42:00 1.000000 days
6 2002-12-08 03:42:00 NA days
7 2002-12-09 19:42:00 40.000000 days
8 2002-12-09 21:43:00 2.016667 days
9 2002-12-10 17:42:00 19.983333 days
10 2002-12-12 17:42:00 48.000000 days
11 2002-12-12 19:42:00 NA days
12 2002-12-13 17:42:00 22.000000 days
13 2002-12-13 19:42:00 2.000000 days
14 2002-12-13 23:42:00 4.000000 days
15 2002-12-14 03:42:00 4.000000 days
16 2002-12-14 09:42:00 NA days
17 2002-12-14 19:42:00 10.000000 days
18 2002-12-15 03:42:00 8.000000 days
19 2002-12-15 11:42:00 NA days
I noticed that some of the output is in hours (line 8 - line 7) whereas some of the output is in days (line 5 - line 4). How can I receive consistent output (hours would be preferred)? Thanks in advance.

Resources