Conducting regression analysis using R via SQL Server 2017

I want to perform regression analysis using R code via SQL Server 2017 (R is integrated there).
Here is the native R code, which works with a CSV file.
The main point of the code is that the regression is performed separately for each group defined by [CustomerName]+[ItemRelation]+[DocumentNum]+[DocumentYear].
df = read.csv("C:/Users/synthex/Desktop/re.csv", sep = ";", dec = ",")
# load the needed libraries
library(tidyverse)
library(broom)
# order the dataset by columns 5 and 6
df = df[order(df[, 5]), ]
df = df[order(df[, 6]), ]
# delete hyphens (the column is CustomerName in the reproducible example below)
df$CustomerName <- gsub("\\-", "", df$CustomerName)
# lm function to run the regression separately by group
my_lm <- function(df) {
  lm(SaleCount ~ IsPromo, data = df)
}
reg = df %>%
  group_by(CustomerName, ItemRelation, DocumentNum, DocumentYear) %>%
  nest() %>%
  mutate(fit = map(data, my_lm),
         tidy = map(fit, tidy)) %>%
  select(-fit, -data) %>%
  unnest()
w = aggregate(df$action,
              by = list(CustomerName = df$CustomerName, ItemRelation = df$ItemRelation,
                        DocumentNum = df$DocumentNum, DocumentYear = df$DocumentYear),
              FUN = sum)
View(w)
# multiply each group's estimate by the number of days of the action
EA <- data.frame(reg$CustomerName, reg$ItemRelation, reg$DocumentNum, reg$DocumentYear,
                 reg$estimate * w$x)
# delete the intercepts: tidy() emits the intercept (odd rows) then the slope (even rows)
# for each group, so keeping the even rows keeps only the slopes
toDelete <- seq(2, nrow(EA), 2)
newdat = EA[toDelete, ]
View(newdat)
The desired end result is for this code to run in SSMS.
So here is what I did:
EXECUTE sp_execute_external_script
    @language = N'R'
  , @script = N' OutputDataSet <- InputDataSet;'
  , @input_data_1 = N' SELECT [CustomerName]
      ,[ItemRelation]
      ,[SaleCount]
      ,[DocumentNum]
      ,[DocumentYear]
      ,[IsPromo]
  FROM [Action].[dbo].[promo_data];'
WITH RESULT SETS (([CustomerName] nvarchar(max) NOT NULL, [ItemRelation] int NOT NULL,
    [SaleCount] int NOT NULL, [DocumentNum] int NOT NULL,
    [DocumentYear] int NOT NULL, [IsPromo] int NOT NULL));
df = as.data.frame(InputDataSet)
Msg 102, Level 15, State 1, Line 17
Incorrect syntax near '='.
So, how do I perform regression analysis in SQL separately by group?
Note that all coefficients must be saved, because when new data arrive in SQL, the predictions should be calculated automatically from the equation of the model built for each group.
The code above simply estimates the impact of the action: the beta coefficient of each group is multiplied by the number of days of the action for that group.
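The "Incorrect syntax near '='" error most likely comes from the R line df = as.data.frame(InputDataSet) sitting after the T-SQL statement: every R statement has to live inside the @script string. Below is a minimal sketch of the whole group-wise regression embedded in @script. It is an untested outline; it assumes the tidyverse and broom packages are installed for the SQL Server R instance.

EXECUTE sp_execute_external_script
    @language = N'R'
  , @script = N'
      library(dplyr); library(tidyr); library(purrr); library(broom)
      my_lm <- function(d) lm(SaleCount ~ IsPromo, data = d)
      OutputDataSet <- InputDataSet %>%
        group_by(CustomerName, ItemRelation, DocumentNum, DocumentYear) %>%
        nest() %>%
        mutate(fit = map(data, my_lm),
               coefs = map(fit, tidy)) %>%
        select(-fit, -data) %>%
        unnest(coefs) %>%
        as.data.frame()'
  , @input_data_1 = N'SELECT CustomerName, ItemRelation, SaleCount,
      DocumentNum, DocumentYear, IsPromo
      FROM [Action].[dbo].[promo_data];'
WITH RESULT SETS (([CustomerName] nvarchar(max), [ItemRelation] int,
    [DocumentNum] int, [DocumentYear] int, [term] nvarchar(max),
    [estimate] float, [std_error] float, [statistic] float, [p_value] float));

Inserting that result set into a coefficients table (for example with INSERT INTO ... EXECUTE sp_execute_external_script, targeting a hypothetical dbo.promo_coefs table) keeps the coefficients saved, so new data can later be scored by joining on the four group keys.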
If needed, here is a reproducible example:
df=structure(list(CustomerName = structure(c(1L, 2L, 3L, 3L, 1L,
2L, 3L, 3L, 4L, 4L, 4L, 1L, 2L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 1L, 2L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
1L, 2L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 1L,
2L, 3L, 3L, 4L, 4L, 4L, 4L, 4L), .Label = c("Attacks of the vehicle",
"Auchan TS", "Tape of the vehicle", "X5 Retail Group"), class = "factor"),
ItemRelation = c(13322L, 13322L, 158121L, 158122L, 13322L,
13322L, 158121L, 158122L, 11592L, 13189L, 13191L, 13322L,
13322L, 158121L, 158122L, 11592L, 13189L, 13191L, 158121L,
158121L, 158122L, 158122L, 13322L, 13322L, 158121L, 158122L,
11592L, 13189L, 13191L, 157186L, 157192L, 158009L, 158010L,
158121L, 158121L, 158122L, 158122L, 13322L, 13322L, 158121L,
158122L, 11592L, 13189L, 13191L, 157186L, 157192L, 158009L,
158010L, 158121L, 158121L, 158122L, 158122L, 13322L, 13322L,
158121L, 158122L, 11514L, 11592L, 11623L, 13189L, 13191L),
SaleCount = c(10L, 35L, 340L, 260L, 3L, 31L, 420L, 380L,
45L, 135L, 852L, 1L, 34L, 360L, 140L, 14L, 62L, 501L, 0L,
560L, 640L, 0L, 0L, 16L, 0L, 0L, 15L, 66L, 542L, 49L, 228L,
3360L, 5720L, 980L, 0L, 0L, 1280L, 9L, 29L, 200L, 120L, 46L,
68L, 569L, 52L, 250L, 2360L, 3140L, 1640L, 0L, 0L, 1820L,
5L, 33L, 260L, 220L, 665L, 25L, -10L, 62L, 281L), DocumentNum = c(36L,
4L, 41L, 41L, 36L, 4L, 41L, 41L, 33L, 33L, 33L, 36L, 4L,
41L, 41L, 33L, 33L, 33L, 63L, 62L, 62L, 63L, 36L, 4L, 41L,
41L, 33L, 33L, 33L, 57L, 56L, 12L, 12L, 62L, 63L, 63L, 62L,
36L, 4L, 41L, 41L, 33L, 33L, 33L, 57L, 56L, 12L, 12L, 62L,
63L, 63L, 62L, 36L, 4L, 41L, 41L, 60L, 33L, 71L, 33L, 33L
), DocumentYear = c(2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L), IsPromo = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L)), .Names = c("CustomerName", "ItemRelation",
"SaleCount", "DocumentNum", "DocumentYear", "IsPromo"), class = "data.frame", row.names = c(NA,
-61L))

Related

ANOVA error: why is each row of output *not* identified by a unique combination of keys?

I have a two-way ANOVA test (w/repeated measures) that I'm using with four almost identical datasets:
> res.aov <- anova_test(
+ data = LST_Weather_dataset_N, dv = LST, wid = Month,
+ within = c(Buffer, TimePeriod),
+ effect.size = "ges",
+ detailed = TRUE,
+ )
Where:
LST = surface temperature deviation in C
Month = 1-12
Buffer = a value 100-1900 - one of 19 areas outward from the boundary of a solar power plant (each 100m wide)
TimePeriod = a factor with a value of 1 or 2 corresponding to pre-/post-construction of a solar power plant.
For one dataset I get the error:
Error: Each row of output must be identified by a unique combination of keys.
Keys are shared for 38 rows:
* 10, 11
* 217, 218
* 240, 241
* 263, 264
* 286, 287
* 309, 310
* 332, 333
...
As far as I can tell I have unique combinations.
dplyr::count(LST_Weather_dataset_N, LST, Month, Buffer, TimePeriod, sort = TRUE)
returns
LST Month Buffer TimePeriod n
1 -6.309045316 12 100 2 1
2 -5.655279925 9 1000 2 1
3 -5.224196295 12 200 2 1
4 -5.194473224 9 1100 2 1
5 -5.025429891 12 400 2 1
6 -4.987575966 9 700 2 1
7 -4.979453868 12 600 2 1
8 -4.825298768 12 300 2 1
9 -4.668994574 12 500 2 1
10 -4.652282192 12 700 2 1
...
'n' is always 1.
I can't work out why this is happening.
Extract of the dataframe below:
> dput(LST_Weather_dataset_N[sample(1:nrow(LST_Weather_dataset_N), 50),])
structure(list(Buffer = c(1400L, 700L, 300L, 1400L, 100L, 200L,
1700L, 100L, 800L, 1900L, 1100L, 100L, 700L, 800L, 1400L, 400L,
1300L, 200L, 1200L, 500L, 1200L, 1300L, 400L, 1000L, 1300L, 1100L,
100L, 300L, 300L, 600L, 1100L, 1400L, 1500L, 1600L, 1700L, 1800L,
1700L, 1300L, 1200L, 300L, 1100L, 1900L, 1700L, 700L, 1400L,
1200L, 1600L, 1700L, 1900L, 1300L), Date = c("02/05/2014", "18/01/2017",
"19/06/2014", "25/12/2013", "15/09/2017", "08/04/2017", "22/08/2014",
"21/07/2014", "13/07/2017", "25/12/2013", "22/10/2013", "02/05/2014",
"07/03/2017", "15/03/2014", "13/07/2017", "19/06/2014", "25/12/2013",
"17/10/2017", "16/04/2014", "06/10/2013", "15/09/2017", "18/01/2017",
"10/01/2014", "17/12/2016", "13/07/2017", "19/06/2014", "07/03/2017",
"15/03/2014", "11/02/2014", "22/10/2013", "06/10/2013", "15/09/2017",
"16/04/2014", "18/01/2017", "15/03/2014", "21/07/2014", "17/10/2017",
"15/09/2017", "10/01/2014", "23/09/2014", "16/04/2014", "22/10/2013",
"11/06/2017", "26/05/2017", "19/06/2014", "14/08/2017", "11/02/2014",
"26/02/2017", "26/02/2017", "11/02/2014"), LST = c(1.255502397,
4.33385966, 3.327025603, -0.388631166, -0.865430798, 4.386292648,
-0.243018665, 3.276865987, 0.957036835, -0.065821795, 0.69731779,
4.846851651, -1.437700684, 1.003808572, 0.572460421, 2.995902374,
-0.334633662, -1.231447567, 0.644520741, 0.808262029, -3.392959991,
2.324569449, 2.346707612, -3.124354627, 0.58719862, 1.904859254,
1.701580958, 2.792443253, 1.638270039, 1.460743317, 0.699767335,
-3.015643366, 0.930527864, 1.309519336, 0.477789664, 0.147584938,
-0.498188865, -3.506795723, -1.007487965, 1.149604087, 1.192366386,
0.197471474, 0.999391224, -0.190613618, 1.27324015, 2.686622796,
0.573109026, 0.97847983, 0.395005095, -0.40855426), Month = c(5L,
1L, 6L, 12L, 9L, 4L, 8L, 7L, 7L, 12L, 10L, 5L, 3L, 3L, 7L, 6L,
12L, 10L, 4L, 10L, 9L, 1L, 1L, 12L, 7L, 6L, 3L, 3L, 2L, 10L,
10L, 9L, 4L, 1L, 3L, 7L, 10L, 9L, 1L, 9L, 4L, 10L, 6L, 5L, 6L,
8L, 2L, 2L, 2L, 2L), Year = c(2014L, 2017L, 2014L, 2013L, 2017L,
2017L, 2014L, 2014L, 2017L, 2013L, 2013L, 2014L, 2017L, 2014L,
2017L, 2014L, 2013L, 2017L, 2014L, 2013L, 2017L, 2017L, 2014L,
2016L, 2017L, 2014L, 2017L, 2014L, 2014L, 2013L, 2013L, 2017L,
2014L, 2017L, 2014L, 2014L, 2017L, 2017L, 2014L, 2014L, 2014L,
2013L, 2017L, 2017L, 2014L, 2017L, 2014L, 2017L, 2017L, 2014L
), JulianDay = c(122L, 18L, 170L, 359L, 258L, 98L, 234L, 202L,
194L, 359L, 295L, 122L, 66L, 74L, 194L, 170L, 359L, 290L, 106L,
279L, 258L, 18L, 10L, 352L, 194L, 170L, 66L, 74L, 42L, 295L,
279L, 258L, 106L, 18L, 74L, 202L, 290L, 258L, 10L, 266L, 106L,
295L, 162L, 146L, 170L, 226L, 42L, 57L, 57L, 42L), TimePeriod = c(1L,
2L, 1L, 1L, 2L, 2L, 1L, 1L, 2L, 1L, 1L, 1L, 2L, 1L, 2L, 1L, 1L,
2L, 1L, 1L, 2L, 2L, 1L, 2L, 2L, 1L, 2L, 1L, 1L, 1L, 1L, 2L, 1L,
2L, 1L, 1L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 1L, 2L, 1L, 2L, 2L,
1L), Temperature = c(28L, 9L, 31L, 12L, 27L, 21L, 29L, 36L, 38L,
12L, 23L, 28L, 12L, 21L, 38L, 31L, 12L, 23L, 25L, 22L, 27L, 9L,
11L, 7L, 38L, 31L, 12L, 21L, 14L, 23L, 22L, 27L, 25L, 9L, 21L,
36L, 23L, 27L, 11L, 31L, 25L, 23L, 29L, 27L, 31L, 34L, 14L, 16L,
16L, 14L), Humidity = c(6L, 34L, 7L, 31L, 29L, 22L, 34L, 15L,
19L, 31L, 16L, 6L, 14L, 14L, 19L, 7L, 31L, 12L, 9L, 12L, 29L,
34L, 33L, 18L, 19L, 7L, 14L, 14L, 31L, 16L, 12L, 29L, 9L, 34L,
14L, 15L, 12L, 29L, 33L, 18L, 9L, 16L, 8L, 13L, 7L, 13L, 31L,
31L, 31L, 31L), Wind_speed = c(6L, 0L, 6L, 7L, 13L, 33L, 6L,
20L, 9L, 7L, 0L, 6L, 0L, 6L, 9L, 6L, 7L, 6L, 0L, 7L, 13L, 0L,
0L, 35L, 9L, 6L, 0L, 6L, 6L, 0L, 7L, 13L, 0L, 0L, 6L, 20L, 6L,
13L, 0L, 0L, 0L, 0L, 24L, 11L, 6L, 24L, 6L, 26L, 26L, 6L), Wind_gust = c(0L,
0L, 0L, 0L, 0L, 54L, 0L, 46L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 48L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 46L, 0L, 0L, 0L, 0L, 0L, 0L, 48L, 0L, 0L, 39L,
0L, 41L, 41L, 0L), Wind_trend = c(1L, 0L, 1L, 1L, 2L, 2L, 0L,
1L, 2L, 1L, 0L, 1L, 0L, 1L, 2L, 1L, 1L, 0L, 0L, 2L, 2L, 0L, 1L,
1L, 2L, 1L, 0L, 1L, 1L, 0L, 2L, 2L, 0L, 0L, 1L, 1L, 0L, 2L, 1L,
1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), Wind_direction = c(0,
0, 0, 337.5, 360, 22.5, 0, 22.5, 0, 337.5, 0, 0, 0, 0, 0, 0,
337.5, 180, 0, 247.5, 360, 0, 0, 180, 0, 0, 0, 0, 337.5, 0, 247.5,
360, 0, 0, 0, 22.5, 180, 360, 0, 0, 0, 0, 360, 22.5, 0, 360,
337.5, 360, 360, 337.5), Pressure = c(940.2, 943.64, 937.69,
951.37, 932.69, 933.94, 937.07, 938.01, 937.69, 951.37, 939.72,
940.2, 948.33, 947.71, 937.69, 937.69, 951.37, 943.32, 932.69,
944.71, 932.69, 943.64, 942.31, 943.01, 937.69, 937.69, 948.33,
947.71, 941.94, 939.72, 944.71, 932.69, 932.69, 943.64, 947.71,
938.01, 943.32, 932.69, 942.31, 938.94, 932.69, 939.72, 928.31,
931.12, 937.69, 932.37, 941.94, 936.13, 936.13, 941.94), Pressure_trend = c(1L,
2L, 0L, 2L, 0L, 1L, 1L, 1L, 1L, 2L, 1L, 1L, 1L, 1L, 1L, 0L, 2L,
1L, 2L, 1L, 0L, 2L, 2L, 2L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 2L,
2L, 1L, 1L, 1L, 0L, 2L, 1L, 2L, 1L, 0L, 0L, 0L, 1L, 1L, 2L, 2L,
1L)), row.names = c(179L, 14L, 195L, 426L, 306L, 118L, 299L,
229L, 244L, 436L, 374L, 153L, 90L, 91L, 256L, 197L, 424L, 348L,
137L, 355L, 328L, 26L, 7L, 419L, 254L, 211L, 78L, 81L, 43L, 359L,
373L, 332L, 143L, 32L, 109L, 263L, 393L, 330L, 23L, 309L, 135L,
398L, 224L, 166L, 217L, 290L, 69L, 72L, 76L, 63L), class = "data.frame")
Well, this is a bit embarrassing.
The error arose because the months were not, in fact, paired in the data. Rather than there being 38 data points (19 x 2) for each month, an error in determining the month value left one month with 57 data points (19 x 3). Correcting this, and checking that each month had the same number of paired observations for the ANOVA, allowed the test to run successfully.
> res.aov <- anova_test(
+ data = LST_Weather_dataset_N, dv = LST, wid = Month,
+ within = c(Buffer, TimePeriod),
+ effect.size = "ges",
+ detailed = TRUE,
+ )
> get_anova_table(res.aov, correction = "auto")
ANOVA Table (type III tests)
Effect DFn DFd SSn SSd F p p<.05 ges
1 (Intercept) 1 11 600.135 974.584 6.774 2.50e-02 * 0.189
2 Buffer 18 198 332.217 331.750 11.015 2.05e-21 * 0.115
3 TimePeriod 1 11 29.561 977.945 0.333 5.76e-01 0.011
4 Buffer:TimePeriod 18 198 13.055 283.797 0.506 9.53e-01 0.005
I still don't understand how the error message was telling me this, though.
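For reference: the count above always returned n = 1 because it included LST (the dependent variable) among the keys; counting only the design keys exposes the duplicates. A sketch, assuming the dataset name from the question:

library(dplyr)
# each Month x Buffer x TimePeriod key should occur exactly once
# (19 buffers x 2 periods = 38 rows per month in a balanced design)
LST_Weather_dataset_N %>%
  count(Month, Buffer, TimePeriod) %>%
  filter(n > 1)  # any rows returned are the duplicated keys from the error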

How to calculate the median for groups separately in R

A small example of the data:
df=structure(list(Dt = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L,
22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L,
35L, 36L, 37L, 38L, 39L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L,
23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L,
36L, 37L, 38L, 39L), .Label = c("2018-02-20 00:00:00.000", "2018-02-21 00:00:00.000",
"2018-02-22 00:00:00.000", "2018-02-23 00:00:00.000", "2018-02-24 00:00:00.000",
"2018-02-25 00:00:00.000", "2018-02-26 00:00:00.000", "2018-02-27 00:00:00.000",
"2018-02-28 00:00:00.000", "2018-03-01 00:00:00.000", "2018-03-02 00:00:00.000",
"2018-03-03 00:00:00.000", "2018-03-04 00:00:00.000", "2018-03-05 00:00:00.000",
"2018-03-06 00:00:00.000", "2018-03-07 00:00:00.000", "2018-03-08 00:00:00.000",
"2018-03-09 00:00:00.000", "2018-03-10 00:00:00.000", "2018-03-11 00:00:00.000",
"2018-03-12 00:00:00.000", "2018-03-13 00:00:00.000", "2018-03-14 00:00:00.000",
"2018-03-15 00:00:00.000", "2018-03-16 00:00:00.000", "2018-03-17 00:00:00.000",
"2018-03-18 00:00:00.000", "2018-03-19 00:00:00.000", "2018-03-20 00:00:00.000",
"2018-03-21 00:00:00.000", "2018-03-22 00:00:00.000", "2018-03-23 00:00:00.000",
"2018-03-24 00:00:00.000", "2018-03-25 00:00:00.000", "2018-03-26 00:00:00.000",
"2018-03-27 00:00:00.000", "2018-03-28 00:00:00.000", "2018-03-29 00:00:00.000",
"2018-03-30 00:00:00.000"), class = "factor"), ItemRelation = c(158043L,
158043L, 158043L, 158043L, 158043L, 158043L, 158043L, 158043L,
158043L, 158043L, 158043L, 158043L, 158043L, 158043L, 158043L,
158043L, 158043L, 158043L, 158043L, 158043L, 158043L, 158043L,
158043L, 158043L, 158043L, 158043L, 158043L, 158043L, 158043L,
158043L, 158043L, 158043L, 158043L, 158043L, 158043L, 158043L,
158043L, 158043L, 158043L, 234L, 234L, 234L, 234L, 234L, 234L,
234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L,
234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L,
234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L
), stuff = c(200L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 3600L,
0L, 0L, 0L, 0L, 700L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 1000L, 2600L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 400L, 700L,
200L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 3600L, 0L, 0L, 0L,
0L, 700L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1000L,
2600L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 400L, 700L), num = c(1459L,
1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L,
1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L,
1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L,
1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L,
1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L,
1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L,
1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L,
1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L,
1459L, 1459L, 1459L, 1459L, 1459L), year = c(2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L), action = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 1L, 1L, 1L)), .Names = c("Dt", "ItemRelation",
"stuff", "num", "year", "action"), class = "data.frame", row.names = c(NA,
-78L))
Now, for each group
ItemRelation + num + year, I have to calculate the median.
If I use this solution:
# df with action 0 and stuff > 0
v <- df$stuff[intersect(which(df$action == 0),
                        which(df$stuff > 0))]
# df with action 1 and stuff > 0
w <- df$stuff[intersect(which(df$action == 1),
                        which(df$stuff > 0))]
# calculating the median of v for the last 5 observations
l <- length(v)
m0 <- median(v[(l - 4):l]) # taking the median of the last 5 observations
# computing the final difference
m <- median(w) - m0
I calculate the median for all groups at once, but I have to calculate it for each group separately.
How can I do that?
Here is the expected output:
ItemRelation num year value
158043 1459 2018 45
234 1459 2018 67
Post edited. Note that the values are not real; the actual medians will differ. I just wanted to show the shape of the output I want.
Edit
The action column has only two values, 0 and 1. I must calculate the median of stuff for the 1 category of action, and then the median of stuff for the 0 category of action, using only the last five non-zero (integer) values before the 1 category, not all values of the 0 category. In our case these are:
200
3600
700
1000
2600
Then subtract the median of the 0 category from the median of the 1 category.
The number of non-zero stuff observations in the 0 category can vary from 0 to 10. If there are 10 non-zero values, we take the last five. If there are only 1, 2, 3, 4 or 5 non-zero values, we take the median of however many there are. If there are none, we just subtract 0.
But the code must calculate the median of the 0 category over the last 5 such observations before the 1 category.
Note: instead of 0, there may be other values for the zero category of action.
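To make the rule concrete, here is a base-R sketch for a single group, using the sample data above (it reproduces the -450 that the answer below arrives at):

# one group: ItemRelation 158043, num 1459, year 2018
g  <- df[df$ItemRelation == 158043 & df$num == 1459 & df$year == 2018, ]
v  <- g$stuff[g$action == 0 & g$stuff > 0]           # non-zero values, zero category
w  <- g$stuff[g$action == 1 & g$stuff > 0]           # non-zero values, one category
m0 <- if (length(v) == 0) 0 else median(tail(v, 5))  # last 5 (or fewer) of the zero category
m  <- median(w) - m0                                 # 550 - 1000 = -450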
One solution can be achieved using dplyr by following the steps below; see the comments in the code for the approach.
Note: the sample data from the OP does not seem very meaningful as such.
library(dplyr)
df %>%
  filter(stuff > 0) %>%                      # first keep only stuff > 0, which is of interest
  group_by(ItemRelation, num, year) %>%
  mutate(m = median(stuff[action == 1]),
         m0 = median(tail(stuff[action == 0], 5))) %>% # calculate m and m0 for all rows
  filter(action == 1) %>%                    # now keep only rows with action == 1
  mutate(m = m - m0) %>%
  select(-Dt, -m0, -action)
# # A tibble: 4 x 5
# # Groups: ItemRelation, num, year [2]
# ItemRelation stuff num year m
# <int> <int> <int> <int> <dbl>
# 1 158043 400 1459 2018 -450
# 2 158043 700 1459 2018 -450
# 3 234 400 1459 2018 -450
# 4 234 700 1459 2018 -450
The easiest way to do this is to use group_by and summarize from the dplyr package:
library(dplyr)
# median of the non-zero values in each group, split by action category
medians <- df %>%
  filter(stuff > 0) %>%
  group_by(ItemRelation, num, year, action) %>%
  summarize(med = median(stuff, na.rm = TRUE))
# difference between the action == 1 and action == 0 medians per group
# (note: this uses all non-zero values, not just the last five)
median_diffs <- medians %>%
  group_by(ItemRelation, num, year) %>%
  summarize(med_diff = med[action == 1] - med[action == 0])

Multiply median for groups separately in R by condition

I have this dataset
df=structure(list(Dt = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L,
22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L,
35L, 36L, 37L, 38L, 39L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L, 21L, 22L,
23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L, 35L,
36L, 37L, 38L, 39L), .Label = c("2018-02-20 00:00:00.000", "2018-02-21 00:00:00.000",
"2018-02-22 00:00:00.000", "2018-02-23 00:00:00.000", "2018-02-24 00:00:00.000",
"2018-02-25 00:00:00.000", "2018-02-26 00:00:00.000", "2018-02-27 00:00:00.000",
"2018-02-28 00:00:00.000", "2018-03-01 00:00:00.000", "2018-03-02 00:00:00.000",
"2018-03-03 00:00:00.000", "2018-03-04 00:00:00.000", "2018-03-05 00:00:00.000",
"2018-03-06 00:00:00.000", "2018-03-07 00:00:00.000", "2018-03-08 00:00:00.000",
"2018-03-09 00:00:00.000", "2018-03-10 00:00:00.000", "2018-03-11 00:00:00.000",
"2018-03-12 00:00:00.000", "2018-03-13 00:00:00.000", "2018-03-14 00:00:00.000",
"2018-03-15 00:00:00.000", "2018-03-16 00:00:00.000", "2018-03-17 00:00:00.000",
"2018-03-18 00:00:00.000", "2018-03-19 00:00:00.000", "2018-03-20 00:00:00.000",
"2018-03-21 00:00:00.000", "2018-03-22 00:00:00.000", "2018-03-23 00:00:00.000",
"2018-03-24 00:00:00.000", "2018-03-25 00:00:00.000", "2018-03-26 00:00:00.000",
"2018-03-27 00:00:00.000", "2018-03-28 00:00:00.000", "2018-03-29 00:00:00.000",
"2018-03-30 00:00:00.000"), class = "factor"), ItemRelation = c(158043L,
158043L, 158043L, 158043L, 158043L, 158043L, 158043L, 158043L,
158043L, 158043L, 158043L, 158043L, 158043L, 158043L, 158043L,
158043L, 158043L, 158043L, 158043L, 158043L, 158043L, 158043L,
158043L, 158043L, 158043L, 158043L, 158043L, 158043L, 158043L,
158043L, 158043L, 158043L, 158043L, 158043L, 158043L, 158043L,
158043L, 158043L, 158043L, 234L, 234L, 234L, 234L, 234L, 234L,
234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L,
234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L,
234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L, 234L
), stuff = c(200L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 3600L,
0L, 0L, 0L, 0L, 700L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 1000L, 2600L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 400L, 700L,
200L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 3600L, 0L, 0L, 0L,
0L, 700L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1000L,
2600L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 400L, 700L), num = c(1459L,
1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L,
1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L,
1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L,
1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L,
1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L,
1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L,
1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L,
1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L, 1459L,
1459L, 1459L, 1459L, 1459L, 1459L), year = c(2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L, 2018L,
2018L, 2018L, 2018L), action = c(0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 1L, 1L, 1L)), .Names = c("Dt", "ItemRelation",
"stuff", "num", "year", "action"), class = "data.frame", row.names = c(NA,
-78L))
The following operations were performed on this data:
1. Calculate the median of stuff for the 1 category of action and for the 0 category of action (using the last five non-zero observations).
2. Subtract the median of the 0 category from the median of the 1 category.
The solution by MKR is very accurate:
library(dplyr)
df %>%
  filter(stuff > 0) %>%                      # first keep only stuff > 0, which is of interest
  group_by(ItemRelation, num, year) %>%
  mutate(m = median(stuff[action == 1]),
         m0 = median(tail(stuff[action == 0], 5))) %>% # calculate m and m0 for all rows
  filter(action == 1) %>%                    # now keep only rows with action == 1
  mutate(m = m - m0) %>%
  select(-Dt, -m0, -action)
How can the calculated result for each group be multiplied by the number of 1s in action, counting only the rows where stuff is greater than zero?
For example, for the stratum
ItemRelation num year
158043 1459 2018
we have 4 ones in action, but only two of them have stuff greater than zero,
so the calculated result (m) should be multiplied by two.
The data is already filtered for stuff > 0 in the dplyr chain. After the filter on action == 1, n() represents the count per group where stuff > 0 and action == 1, hence one can multiply the final value of m by n(). At the end, distinct() ensures that duplicate rows are removed.
library(dplyr)
df %>%
  filter(stuff > 0) %>%                      # first keep only stuff > 0, which is of interest
  group_by(ItemRelation, num, year) %>%
  mutate(m = median(stuff[action == 1]),
         m0 = median(tail(stuff[action == 0], 5))) %>% # calculate m and m0 for all rows
  filter(action == 1) %>%                    # now keep only rows with action == 1
  mutate(m = (m - m0) * n()) %>%
  select(-Dt, -m0, -action, -stuff) %>%
  distinct()
# # A tibble: 2 x 4
# # Groups: ItemRelation, num, year [2]
# ItemRelation num year m
# <int> <int> <int> <dbl>
# 1 158043 1459 2018 -900
# 2 234 1459 2018 -900

Create SQL expression in R for a certain condition

I get the data from SQL Server to perform regression analysis, and then return the regression results to another SQL table.
library("RODBC")
library(sqldf)
dbHandle <- odbcDriverConnect("driver={SQL Server};server=MYSERVER;database=MYBASE;trusted_connection=true")
sql <-
"select
Dt
,CustomerName
,ItemRelation
,SaleCount
,DocumentNum
,DocumentYear
,IsPromo
from dbo.mytable"
df <- sqlQuery(dbHandle, sql)
After this query I must perform regression analysis separately for each group:
# libraries needed for group_by/nest/map/tidy below
library(tidyverse)
library(broom)
my_lm <- function(df) {
  lm(SaleCount ~ IsPromo, data = df)
}
reg = df %>%
  group_by(CustomerName, ItemRelation, DocumentNum, DocumentYear) %>%
  nest() %>%
  mutate(fit = map(data, my_lm),
         tidy = map(fit, tidy)) %>%
  select(-fit, -data) %>%
  unnest()
View(reg)
#save to sql table
sqlSave(dbHandle, as.data.frame(reg), "dbo.mytableforecast", verbose = TRUE) # use "append = TRUE" to add rows to an existing table
odbcClose(dbHandle)
The question:
The script runs automatically, i.e. the scheduler has a task that launches the script at a certain time.
For example, today 100 observations were loaded,
from 01.01.2017-10.04.2017.
The script performed the regression and returned the data to the SQL table.
Tomorrow 100 new observations will be loaded,
11.04.2017-20.07.2017.
I.e. when the new data are loaded tomorrow and the script starts at 10 pm, it must work only with the data from 11.04.2017-20.07.2017, not from 01.01.2017-20.07.2017.
The situation is complicated by the fact that after the regression the Dt column is dropped, so the solution given to me here does not work:
Automatic transfer data from the sql to R
because Dt is absent.
How can I build the condition for the scheduled job: select Dt, CustomerName, ItemRelation, SaleCount, DocumentNum, DocumentYear, IsPromo from dbo.mytable "where Dt > the last date when the script was launched"?
Is it possible to create this expression?
Data example from SQL:
df=structure(list(Dt = structure(c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
8L, 9L, 9L, 10L, 10L, 11L, 11L, 12L, 12L, 13L, 13L, 14L, 14L,
15L, 15L, 16L, 16L, 16L, 16L, 17L, 17L, 17L, 17L, 18L, 18L, 18L,
18L, 19L), .Label = c("2017-10-12 00:00:00.000", "2017-10-13 00:00:00.000",
"2017-10-14 00:00:00.000", "2017-10-15 00:00:00.000", "2017-10-16 00:00:00.000",
"2017-10-17 00:00:00.000", "2017-10-18 00:00:00.000", "2017-10-19 00:00:00.000",
"2017-10-20 00:00:00.000", "2017-10-21 00:00:00.000", "2017-10-22 00:00:00.000",
"2017-10-23 00:00:00.000", "2017-10-24 00:00:00.000", "2017-10-25 00:00:00.000",
"2017-10-26 00:00:00.000", "2017-10-27 00:00:00.000", "2017-10-28 00:00:00.000",
"2017-10-29 00:00:00.000", "2017-10-30 00:00:00.000"), class = "factor"),
CustomerName = structure(c(1L, 11L, 12L, 13L, 14L, 15L, 16L,
17L, 18L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 1L, 11L, 12L,
13L, 14L, 15L, 16L, 17L, 18L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L), .Label = c("x1", "x10", "x11", "x12", "x13", "x14",
"x15", "x16", "x17", "x18", "x2", "x3", "x4", "x5", "x6",
"x7", "x8", "x9"), class = "factor"), ItemRelation = c(13322L,
13322L, 13322L, 13322L, 13322L, 13322L, 13322L, 11706L, 13322L,
11706L, 13322L, 11706L, 13322L, 11706L, 13322L, 11706L, 13322L,
11706L, 13322L, 11706L, 13322L, 11706L, 13322L, 11706L, 13163L,
13322L, 158010L, 11706L, 13163L, 13322L, 158010L, 11706L,
13163L, 13322L, 158010L, 11706L), SaleCount = c(10L, 3L,
1L, 0L, 9L, 5L, 5L, 11L, 7L, 0L, 5L, 11L, 1L, 0L, 0L, 19L,
10L, 0L, 1L, 12L, 1L, 11L, 6L, 0L, 167L, 7L, 0L, 16L, 165L,
1L, 0L, 0L, 29L, 0L, 0L, 11L), DocumentNum = c(36L, 36L,
36L, 36L, 36L, 36L, 36L, 51L, 36L, 51L, 36L, 51L, 36L, 51L,
36L, 51L, 36L, 51L, 36L, 51L, 36L, 51L, 36L, 51L, 131L, 36L,
89L, 51L, 131L, 36L, 89L, 51L, 131L, 36L, 89L, 51L), DocumentYear = c(2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L,
2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L, 2017L),
IsPromo = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), .Names = c("Dt", "CustomerName",
"ItemRelation", "SaleCount", "DocumentNum", "DocumentYear", "IsPromo"
), class = "data.frame", row.names = c(NA, -36L))
Consider saving the max Dt (retrieved before the regression step that drops the field) in a log file at the end of your scheduled script, then add a log read-in at the beginning of the script to get the last logged date for the WHERE clause:
# READ DATE FROM LOG FILE
log_dt <- readLines("/path/to/SQL_MaxDate.txt", warn=FALSE)
# QUERY WITH WHERE CLAUSE
sql <- paste0("SELECT Dt, CustomerName, ItemRelation, SaleCount,
DocumentNum, DocumentYear, IsPromo
FROM dbo.mytable WHERE Dt > '", log_dt, "'")
df <- sqlQuery(dbHandle, sql)
# RETRIEVE MAX DATE VALUE
max_DT <- as.character(max(df$Dt))
# ... regression
# WRITE DATE TO LOG FILE
cat(max_DT, file="/path/to/SQL_MaxDate.txt")
Better yet, use parameterization with RODBCext to avoid string concatenation and quoting:
library(RODBC)
library(RODBCext)
# READ DATE FROM LOG FILE
log_dt <- readLines("/path/to/SQL_MaxDate.txt", warn=FALSE)
dbHandle <- odbcDriverConnect(...)
# PREPARED STATEMENT WITH PLACEHOLDER
sql <- "SELECT Dt, CustomerName, ItemRelation, SaleCount,
DocumentNum, DocumentYear, IsPromo
FROM dbo.mytable WHERE Dt > ?"
# EXECUTE QUERY BINDING PARAM VALUE
df <- sqlExecute(dbHandle, sql, log_dt, fetch=TRUE)
# RETRIEVE MAX DATE VALUE
max_DT <- as.character(max(df$Dt))
# ... regression
# WRITE DATE TO LOG FILE
cat(max_DT, file="/path/to/SQL_MaxDate.txt")
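As a variation (not part of the original answer), the watermark could be kept in a small SQL table instead of a text file, so everything stays in the database. A sketch using a hypothetical dbo.script_log(LastDt datetime) table:

# read the last watermark from the hypothetical log table
log_dt <- sqlQuery(dbHandle, "SELECT MAX(LastDt) AS LastDt FROM dbo.script_log")$LastDt
# ... parameterized query and regression as above ...
# write the new watermark
sqlQuery(dbHandle, paste0("INSERT INTO dbo.script_log (LastDt) VALUES ('", max_DT, "')"))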

Data manipulation: spread columns with different number of rows in dplyr

I am trying to spread the time columns of my dataframe. left_join would be my choice, but the age and geo groups differ, so I end up with most years containing NA values and one of the age categories disappearing.
library(dplyr)
dt %>%
filter(time!=2001) %>%
group_by(time, geo, age, sex) %>%
filter(time==2011) %>%
left_join(.,dt %>%
group_by(time, sex, age, geo) %>%
mutate(time2 = 2011) %>%
filter(time != 2011) %>%
spread(time, value),
by = c('time' = 'time2', 'age', 'geo'))
What I obtain is this:
time geo sex.x age value sex.y `2000` `2001` `2002` `2003`
2011 51900 1 0 27933 1 NA 26193 NA NA
2011 51900 1 0 27933 2 NA 22760 NA NA
2011 51900 1 5 20627 1 NA 26213 NA NA
2011 51900 1 5 20627 2 NA 25647 NA NA
...
2011 51900 1 75 6400 1 NA 5313 NA NA
2011 51900 1 75 6400 2 NA 11500 NA NA
2011 51900 1 80 4520 NA NA NA NA NA
but there is a problem with the `value` column, as it repeats the same values twice (and it shouldn't), and the year columns 2000, 2002, ..., 2020 are mostly NA.
What I would like is this:
geo sex age 2001 2011 2000 2002 2003 ... 2020
51900 1 0 39290 41900 69844 55281 55545 58045
51900 2 0 34140 38270 61192 65301 65429 65391
51902 1 0 4307 4193 69844 55281 55545 58045
51902 2 0 3753 3453 61192 65301 65429 65391
...
51900 1 80 NA 41900 104766 97952 98143 87068
51900 2 80 NA 38270 91788 89921 83317 98086
dt = structure(list(time = c(2001L, 2001L, 2001L, 2001L, 2001L, 2001L,
2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2001L, 2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 2011L,
2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 2011L, 2000L, 2000L, 2000L, 2000L, 2000L, 2002L,
2002L, 2002L, 2002L, 2002L, 2003L, 2003L, 2003L, 2003L, 2003L, 2004L, 2004L, 2004L, 2004L, 2004L, 2005L, 2005L, 2005L, 2005L,
2005L, 2006L, 2006L, 2006L, 2006L, 2006L, 2007L, 2007L, 2007L, 2007L, 2007L, 2008L, 2008L, 2008L, 2008L, 2008L, 2009L, 2009L,
2009L, 2009L, 2009L, 2010L, 2010L, 2010L, 2010L, 2010L, 2012L, 2012L, 2012L, 2012L, 2012L, 2013L, 2013L, 2013L, 2013L, 2013L,
2014L, 2014L, 2014L, 2014L, 2014L, 2015L, 2015L, 2015L, 2015L, 2015L, 2016L, 2016L, 2016L, 2016L, 2016L, 2017L, 2017L, 2017L,
2017L, 2017L, 2018L, 2018L, 2018L, 2018L, 2018L, 2019L, 2019L, 2019L, 2019L, 2019L, 2020L, 2020L, 2020L, 2020L, 2020L, 2000L,
2000L, 2000L, 2000L, 2000L, 2002L, 2002L, 2002L, 2002L, 2002L, 2003L, 2003L, 2003L, 2003L, 2003L, 2004L, 2004L, 2004L, 2004L,
2004L, 2005L, 2005L, 2005L, 2005L, 2005L, 2006L, 2006L, 2006L, 2006L, 2006L, 2007L, 2007L, 2007L, 2007L, 2007L, 2008L, 2008L,
2008L, 2008L, 2008L, 2009L, 2009L, 2009L, 2009L, 2009L, 2010L, 2010L, 2010L, 2010L, 2010L, 2012L, 2012L, 2012L, 2012L, 2012L,
2013L, 2013L, 2013L, 2013L, 2013L, 2014L, 2014L, 2014L, 2014L, 2014L, 2015L, 2015L, 2015L, 2015L, 2015L, 2016L, 2016L, 2016L,
2016L, 2016L, 2017L, 2017L, 2017L, 2017L, 2017L, 2018L, 2018L, 2018L, 2018L, 2018L, 2019L, 2019L, 2019L, 2019L, 2019L, 2020L,
2020L, 2020L, 2020L, 2020L), geo = c(51900L, 51900L, 51900L, 51900L, 51900L, 51900L, 51900L, 51900L, 51902L, 51902L, 51902L,
51902L, 51902L, 51902L, 51902L, 51902L, 51900L, 51900L, 51900L, 51900L, 51900L, 51900L, 51900L, 51900L, 51900L, 51900L, 51902L,
51902L, 51902L, 51902L, 51902L, 51902L, 51902L, 51902L, 51902L, 51902L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L,
51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L,
51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L,
51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L,
51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L,
51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L, 51L), sex = c(1L,
1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L), age = c(0L, 5L, 10L, 75L, 0L, 5L, 10L, 75L, 0L, 5L, 10L, 75L, 0L, 5L, 10L, 75L, 0L, 5L, 10L, 75L, 80L, 0L, 5L, 10L, 75L,
80L, 0L, 5L, 10L, 75L, 80L, 0L, 5L, 10L, 75L, 80L, 0L, 5L, 10L, 75L, 80L, 0L, 5L, 10L, 75L, 80L, 0L, 5L, 10L, 75L, 80L, 0L, 5L, 10L, 75L, 80L, 0L, 5L, 10L, 75L, 80L, 0L, 5L, 10L, 75L, 80L, 0L, 5L, 10L, 75L, 80L, 0L, 5L, 10L, 75L, 80L, 0L, 5L, 10L, 75L, 80L, 0L, 5L, 10L, 75L, 80L, 0L, 5L, 10L, 75L, 80L, 0L, 5L, 10L, 75L, 80L, 0L, 5L, 10L, 75L, 80L, 0L, 5L, 10L, 75L, 80L, 0L, 5L, 10L, 75L, 80L, 0L, 5L, 10L, 75L, 80L, 0L, 5L, 10L, 75L, 80L, 0L, 5L, 10L, 75L, 80L, 0L, 5L, 10L, 75L, 80L, 0L, 5L, 10L, 75L, 80L, 0L, 5L, 10L, 75L, 80L, 0L, 5L, 10L, 75L, 80L, 0L, 5L, 10L, 75L, 80L, 0L, 5L, 10L, 75L, 80L, 0L, 5L, 10L, 75L, 80L, 0L, 5L, 10L, 75L, 80L, 0L, 5L, 10L, 75L, 80L, 0L, 5L, 10L, 75L, 80L,
0L, 5L, 10L, 75L, 80L, 0L, 5L, 10L, 75L, 80L, 0L, 5L, 10L, 75L, 80L, 0L, 5L, 10L, 75L, 80L, 0L, 5L, 10L, 75L, 80L, 0L, 5L, 10L, 75L, 80L, 0L, 5L, 10L, 75L, 80L, 0L, 5L, 10L, 75L, 80L, 0L, 5L,
10L, 75L, 80L, 0L, 5L, 10L, 75L, 80L), value = c(26193L, 26213L, 31653L, 5313L, 22760L, 25647L, 31393L, 11500L, 4307L, 4793L,
5947L, 667L, 3753L, 4500L, 5207L, 1440L, 27933L, 20627L, 20593L, 6400L, 4520L, 25513L, 17480L, 17800L, 9520L, 8560L, 4193L, 3027L,
3453L, 800L, 580L, 3453L, 2473L, 2980L, 1013L, 1167L, 61192L, 88249L, 105509L, 20595L, 18198L, 55281L, 76667L, 99967L, 25571L,
19187L, 55545L, 70490L, 95697L, 28376L, 19340L, 56564L, 64639L, 90809L, 30322L, 19579L, 57471L, 59755L, 85464L, 30949L, 20081L,
60145L, 55926L, 79537L, 30083L, 22373L, 61425L, 53664L, 73329L, 27916L, 24891L, 61683L, 52992L, 67148L, 25620L, 27118L, 61776L,
53403L, 61637L, 24601L, 28551L, 62477L, 53990L, 57438L, 25439L, 29074L, 64401L, 56247L, 52992L, 31317L, 30495L, 64691L, 58095L,
52582L, 35069L, 30691L, 64689L, 60083L, 52853L, 37023L, 31297L, 64391L, 61877L, 53538L, 36327L, 32537L, 63158L, 63367L, 54657L,
33260L, 35359L, 61961L, 64311L, 56249L, 28203L, 38591L, 60751L, 64639L, 58159L, 22742L, 41433L, 59469L, 64485L, 60081L, 18813L,
42936L, 58045L, 64127L, 61703L, 17280L, 42758L, 69844L, 93632L, 109773L, 11025L, 7397L, 65301L, 82373L, 103304L, 16130L, 7705L,
65429L, 77025L, 98764L, 18861L, 7835L, 66195L, 72123L, 93892L, 20763L, 8231L, 66949L, 68002L, 88909L, 21513L, 8973L, 69257L,
64759L, 83202L, 21269L, 10813L, 70402L, 62813L, 77601L, 20044L, 12820L, 70681L, 62125L, 72404L, 18627L, 14631L, 70818L, 62321L,
68099L, 17947L, 15893L, 71579L, 62729L, 65085L, 18379L, 16509L, 73653L, 64712L, 61851L, 21697L, 17861L, 73764L, 66737L, 61483L,
23663L, 18103L, 73537L, 68968L, 61599L, 24347L, 18455L, 73041L, 70867L, 62190L, 23305L, 18986L, 71645L, 72368L, 63235L, 21077L,
20717L, 70201L, 73275L, 64867L, 17653L, 22534L, 68704L, 73517L,
66893L, 14089L, 23935L, 67117L, 73238L, 68928L, 11606L, 24343L, 65391L, 72725L, 70609L, 10697L, 23592L)), .Names = c("time",
"geo", "sex", "age", "value"), class = "data.frame", row.names = c(NA, -226L))
You can use the spread function from tidyr:
dt_final <- dt %>%
  spread(time,  # the variable used to create the new columns
         value) # the variable used to fill the rows in the new columns
head(as.tibble(dt_final))
# geo sex age `2000` `2001` `2002` `2003` `2004` `2005` `2006` `2007` `2008` `2009` `2010` `2011` `2012` `2013` `2014` `2015` `2016` `2017` `2018` `2019` `2020`
# <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
# 1 51 1 0 69844 NA 65301 65429 66195 66949 69257 70402 70681 70818 71579 NA 73653 73764 73537 73041 71645 70201 68704 67117 65391
# 2 51 1 5 93632 NA 82373 77025 72123 68002 64759 62813 62125 62321 62729 NA 64712 66737 68968 70867 72368 73275 73517 73238 72725
# 3 51 1 10 109773 NA 103304 98764 93892 88909 83202 77601 72404 68099 65085 NA 61851 61483 61599 62190 63235 64867 66893 68928 70609
# 4 51 1 75 11025 NA 16130 18861 20763 21513 21269 20044 18627 17947 18379 NA 21697 23663 24347 23305 21077 17653 14089 11606 10697
# 5 51 1 80 7397 NA 7705 7835 8231 8973 10813 12820 14631 15893 16509 NA 17861 18103 18455 18986 20717 22534 23935 24343 23592
# 6 51 2 0 61192 NA 55281 55545 56564 57471 60145 61425 61683 61776 62477 NA 64401 64691 64689 64391 63158 61961 60751 59469 58045
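As a side note, on tidyr >= 1.0.0 pivot_wider() supersedes spread(); an equivalent call (same data, same result shape) would be:

library(tidyr)
dt_final <- dt %>%
  pivot_wider(names_from = time,   # column whose values become the new column names
              values_from = value) # column whose values fill the new columns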
