I have a dataset below in which I want to do linear regression for each country and state and then cbind the predicted values in the dataset:
Final data frame after adding three more columns:
I have done it for one country and one area but want to do it for each country and area and put the predicted, upper and lower limit values back in the data set by cbind:
data <- data.frame(country = c("US","US","US","US","US","US","US","US","US","US","UK","UK","UK","UK","UK"),
Area = c("G","G","G","G","G","I","I","I","I","I","A","A","A","A","A"),
week = c(1,2,3,4,5,1,2,3,4,5,1,2,3,4,5),amount = c(12,23,34,32,12,12,34,45,65,45,45,34,23,43,43))
data_1 <- data[(data$country=="US" & data$Area=="G"),]
model <- lm(amount ~ week, data = data_1)
pre <- predict(model,newdata = data_1,interval = "prediction",level = 0.95)
pre
How can I loop this for other combination of country and Area?
...and a Base R solution:
data <- data.frame(country = c("US","US","US","US","US","US","US","US","US","US","UK","UK","UK","UK","UK"),
Area = c("G","G","G","G","G","I","I","I","I","I","A","A","A","A","A"),
week = c(1,2,3,4,5,1,2,3,4,5,1,2,3,4,5),amount = c(12,23,34,32,12,12,34,45,65,45,45,34,23,43,43))
splitVar <- paste0(data$country,"-",data$Area)
dfList <- split(data,splitVar)
result <- do.call(rbind,lapply(dfList,function(x){
model <- lm(amount ~ week, data = x)
cbind(x,predict(model,newdata = x,interval = "prediction",level = 0.95))
}))
result
...the results:
country Area week amount fit lwr upr
UK-A.11 UK A 1 45 36.6 -6.0463638 79.24636
UK-A.12 UK A 2 34 37.1 -1.3409128 75.54091
UK-A.13 UK A 3 23 37.6 0.6671656 74.53283
UK-A.14 UK A 4 43 38.1 -0.3409128 76.54091
UK-A.15 UK A 5 43 38.6 -4.0463638 81.24636
US-G.1 US G 1 12 20.8 -27.6791493 69.27915
US-G.2 US G 2 23 21.7 -21.9985147 65.39851
US-G.3 US G 3 34 22.6 -19.3841749 64.58417
US-G.4 US G 4 32 23.5 -20.1985147 67.19851
US-G.5 US G 5 12 24.4 -24.0791493 72.87915
US-I.6 US I 1 12 20.8 -33.8985900 75.49859
US-I.7 US I 2 34 30.5 -18.8046427 79.80464
US-I.8 US I 3 45 40.2 -7.1703685 87.57037
US-I.9 US I 4 65 49.9 0.5953573 99.20464
US-I.10 US I 5 45 59.6 4.9014100 114.29859
We can also use function augment from package broom to get your desired information:
library(purrr)
library(broom)
data %>%
group_by(country, Area) %>%
nest() %>%
mutate(models = map(data, ~ lm(amount ~ week, data = .)),
aug = map(models, ~ augment(.x, interval = "prediction"))) %>%
unnest(aug) %>%
select(country, Area, amount, week, .fitted, .lower, .upper)
# A tibble: 15 x 7
# Groups: country, Area [3]
country Area amount week .fitted .lower .upper
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
1 US G 12 1 20.8 -27.7 69.3
2 US G 23 2 21.7 -22.0 65.4
3 US G 34 3 22.6 -19.4 64.6
4 US G 32 4 23.5 -20.2 67.2
5 US G 12 5 24.4 -24.1 72.9
6 US I 12 1 20.8 -33.9 75.5
7 US I 34 2 30.5 -18.8 79.8
8 US I 45 3 40.2 -7.17 87.6
9 US I 65 4 49.9 0.595 99.2
10 US I 45 5 59.6 4.90 114.
11 UK A 45 1 36.6 -6.05 79.2
12 UK A 34 2 37.1 -1.34 75.5
13 UK A 23 3 37.6 0.667 74.5
14 UK A 43 4 38.1 -0.341 76.5
15 UK A 43 5 38.6 -4.05 81.2
Here is a tidyverse way to do this for every combination of country and Area.
library(tidyverse)
data %>%
group_by(country, Area) %>%
nest() %>%
mutate(model = map(data, ~ lm(amount ~ week, data = .x)),
result = map2(model, data, ~data.frame(predict(.x, newdata = .y,
interval = "prediction",level = 0.95)))) %>%
ungroup %>%
select(-model) %>%
unnest(c(data, result))
# country Area week amount fit lwr upr
# <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 US G 1 12 20.8 -27.7 69.3
# 2 US G 2 23 21.7 -22.0 65.4
# 3 US G 3 34 22.6 -19.4 64.6
# 4 US G 4 32 23.5 -20.2 67.2
# 5 US G 5 12 24.4 -24.1 72.9
# 6 US I 1 12 20.8 -33.9 75.5
# 7 US I 2 34 30.5 -18.8 79.8
# 8 US I 3 45 40.2 -7.17 87.6
# 9 US I 4 65 49.9 0.595 99.2
#10 US I 5 45 59.6 4.90 114.
#11 UK A 1 45 36.6 -6.05 79.2
#12 UK A 2 34 37.1 -1.34 75.5
#13 UK A 3 23 37.6 0.667 74.5
#14 UK A 4 43 38.1 -0.341 76.5
#15 UK A 5 43 38.6 -4.05 81.2
And one more:
library(tidyverse)
data %>%
mutate(CountryArea=paste0(country,Area) %>% factor %>% fct_inorder) %>%
split(.$CountryArea) %>%
map(~lm(amount~week, data=.)) %>%
map(predict, interval = "prediction",level = 0.95) %>%
reduce(rbind) %>%
cbind(data, .)
country Area week amount fit lwr upr
1 US G 1 12 20.8 -27.6791493 69.27915
2 US G 2 23 21.7 -21.9985147 65.39851
3 US G 3 34 22.6 -19.3841749 64.58417
4 US G 4 32 23.5 -20.1985147 67.19851
5 US G 5 12 24.4 -24.0791493 72.87915
6 US I 1 12 20.8 -33.8985900 75.49859
7 US I 2 34 30.5 -18.8046427 79.80464
8 US I 3 45 40.2 -7.1703685 87.57037
9 US I 4 65 49.9 0.5953573 99.20464
10 US I 5 45 59.6 4.9014100 114.29859
11 UK A 1 45 36.6 -6.0463638 79.24636
12 UK A 2 34 37.1 -1.3409128 75.54091
13 UK A 3 23 37.6 0.6671656 74.53283
14 UK A 4 43 38.1 -0.3409128 76.54091
15 UK A 5 43 38.6 -4.0463638 81.24636
This question already has answers here:
R Sort strings according to substring
(2 answers)
Closed 2 years ago.
I got the dataframe (code) and I I want to sort it according to combName in a numerical order.
> code
# A tibble: 1,108 x 2
combName sumLength
<chr> <dbl>
1 20-1 8.05
2 20-10 14.7
3 20-100 21.2
4 20-101 17.6
5 20-102 25.4
6 20-103 46.3
7 20-104 68.7
8 20-105 24.3
9 20-106 46.3
10 20-107 14.0
# ... with 1,098 more rows
Afterwards the left column should look like:
> code
# A tibble: 1,108 x 2
combName sumLength
<chr> <dbl>
1 20-1 8.05
2 20-2 ...
3 20-3 ...
4 20-4 ...
5 20-5 ...
...
10 20-10 14.7
# ... with 1,098 more rows
It do not know what I can do to reach this format.
Does this work:
library(dplyr)
library(tidyr)
df
# A tibble: 10 x 2
combName sumLength
<chr> <dbl>
1 20-102 25.4
2 20-100 21.2
3 20-101 17.6
4 20-105 24.3
5 20-10 14.7
6 20-103 46.3
7 20-104 68.7
8 20-1 8.05
9 20-106 46.3
10 20-107 14
df %>% separate(combName, into = c('1','2'), sep = '-', remove = F) %>%
type.convert(as.is = T) %>% arrange(`1`,`2`) %>% select(-c(`1`,`2`))
# A tibble: 10 x 2
combName sumLength
<chr> <dbl>
1 20-1 8.05
2 20-10 14.7
3 20-100 21.2
4 20-101 17.6
5 20-102 25.4
6 20-103 46.3
7 20-104 68.7
8 20-105 24.3
9 20-106 46.3
10 20-107 14
I am trying to apply some basic math to daily stock values based on a corresponding yearly value.
reprex
(daily prices)
library(tidyquant)
data(FANG)
# daily prices
FANG %>%
select(c(date, symbol, adjusted)) %>%
group_by(symbol)
# A tibble: 4,032 x 3
# Groups: symbol [4]
date symbol adjusted
<date> <chr> <dbl>
1 2013-01-02 FB 28
2 2013-01-03 FB 27.8
3 2013-01-04 FB 28.8
4 2013-01-07 FB 29.4
5 2013-01-08 FB 29.1
6 2013-01-09 FB 30.6
7 2013-01-10 FB 31.3
8 2013-01-11 FB 31.7
9 2013-01-14 FB 31.0
10 2013-01-15 FB 30.1
# ... with 4,022 more rows
(max price per year)
FANG_yearly_high <-
FANG %>%
group_by(symbol) %>%
summarise_by_time(
.date_var = date,
.by = "year",
price = AVERAGE(adjusted))
# Groups: symbol [4]
symbol date price
<chr> <date> <dbl>
1 AMZN 2013-01-01 404.
2 AMZN 2014-01-01 407.
3 AMZN 2015-01-01 694.
4 AMZN 2016-01-01 844.
5 FB 2013-01-01 58.0
6 FB 2014-01-01 81.4
7 FB 2015-01-01 109.
8 FB 2016-01-01 133.
9 GOOG 2013-01-01 560.
10 GOOG 2014-01-01 609.
11 GOOG 2015-01-01 777.
12 GOOG 2016-01-01 813.
13 NFLX 2013-01-01 54.4
14 NFLX 2014-01-01 69.2
15 NFLX 2015-01-01 131.
16 NFLX 2016-01-01 128.
I would like to divide each daily price by the corresponding max price for the year.
I tried:
FANG %>%
group_by(symbol) %>%
summarise_by_time(
.date_var = date,
.by = "year",
price = AVERAGE(adjusted) / YEAR(date(MAX(adjusted)))
)
and the get this error:
Error in as.POSIXlt.numeric(x, tz = tz(x)) : 'origin' must be supplied
Any sensible way to accomplish this?
Thank you
summarise_by_time is good if you just want to summarise. But you want to divide a daily price by a max of a period. So you need to use mutate. Below are 2 examples. The first one for daily prices, the second for weekly. You can adjust the weekly version easily to monthly.
library(tidyquant)
library(dplyr)
data(FANG)
# daily prices
FANG %>%
select(c(date, symbol, adjusted)) %>%
group_by(symbol, year = year(date)) %>%
mutate(price_pct = adjusted / max(adjusted))
# A tibble: 4,032 x 5
# Groups: symbol, year [16]
date symbol adjusted year price_pct
<date> <chr> <dbl> <dbl> <dbl>
1 2013-01-02 FB 28 2013 0.483
2 2013-01-03 FB 27.8 2013 0.479
3 2013-01-04 FB 28.8 2013 0.496
4 2013-01-07 FB 29.4 2013 0.508
5 2013-01-08 FB 29.1 2013 0.501
6 2013-01-09 FB 30.6 2013 0.528
7 2013-01-10 FB 31.3 2013 0.540
8 2013-01-11 FB 31.7 2013 0.547
9 2013-01-14 FB 31.0 2013 0.534
10 2013-01-15 FB 30.1 2013 0.519
# ... with 4,022 more rows
Weekly / monthly:
# weekly
FANG %>%
select(c(date, symbol, adjusted)) %>%
group_by(symbol) %>%
tq_transmute(mutate_fun = to.period,
period = "weeks" # change weeks to months for monthly
) %>%
group_by(symbol, year = year(date)) %>%
mutate(price_pct = adjusted / max(adjusted))
# A tibble: 836 x 5
# Groups: symbol, year [16]
symbol date adjusted year price_pct
<chr> <date> <dbl> <dbl> <dbl>
1 FB 2013-01-04 28.8 2013 0.519
2 FB 2013-01-11 31.7 2013 0.572
3 FB 2013-01-18 29.7 2013 0.535
4 FB 2013-01-25 31.5 2013 0.569
5 FB 2013-02-01 29.7 2013 0.536
6 FB 2013-02-08 28.5 2013 0.515
7 FB 2013-02-15 28.3 2013 0.511
8 FB 2013-02-22 27.1 2013 0.489
9 FB 2013-03-01 27.8 2013 0.501
10 FB 2013-03-08 28.0 2013 0.504
# ... with 826 more rows
I would like to filter a dataframe based on its date column. I would like to keep the rows where I have at least 3 consecutive days. I would like to do this as effeciently and quickly as possible, so if someone has a vectorized approached it would be good.
I tried to inspire myself from the following link, but it didn't really go well, as it is a different problem:
How to filter rows based on difference in dates between rows in R?
I tried to do it with a for loop, I managed to put an indicator on the dates who are not consecutive, but it didn't give me the desired result, because it keeps all dates that are in a row even if they are less than 3 in a row.
tf is my dataframe
for(i in 2:(nrow(tf)-1)){
if(tf$Date[i] != tf$Date[i+1] %m+% days(-1)){
if(tf$Date[i] != tf$Date[i-1] %m+% days(1)){
tf$Date[i] = as.Date(0)
}
}
}
The first 22 rows of my dataframe look something like this:
Date RR.x RR.y Y
1 1984-10-20 1 10.8 1984
2 1984-11-04 1 12.5 1984
3 1984-11-05 1 7.0 1984
4 1984-11-09 1 22.9 1984
5 1984-11-10 1 24.4 1984
6 1984-11-11 1 19.0 1984
7 1984-11-13 1 5.9 1984
8 1986-10-15 1 10.3 1986
9 1986-10-16 1 18.1 1986
10 1986-10-17 1 11.3 1986
11 1986-11-17 1 14.1 1986
12 2003-10-17 1 7.8 2003
13 2003-10-25 1 7.6 2003
14 2003-10-26 1 5.0 2003
15 2003-10-27 1 6.6 2003
16 2003-11-15 1 26.4 2003
17 2003-11-20 1 10.0 2003
18 2011-10-29 1 10.0 2011
19 2011-11-04 1 11.4 2011
20 2011-11-21 1 9.8 2011
21 2011-11-22 1 5.6 2011
22 2011-11-23 1 20.4 2011
The result should be:
Date RR.x RR.y Y
4 1984-11-09 1 22.9 1984
5 1984-11-10 1 24.4 1984
6 1984-11-11 1 19.0 1984
8 1986-10-15 1 10.3 1986
9 1986-10-16 1 18.1 1986
10 1986-10-17 1 11.3 1986
13 2003-10-25 1 7.6 2003
14 2003-10-26 1 5.0 2003
15 2003-10-27 1 6.6 2003
20 2011-11-21 1 9.8 2011
21 2011-11-22 1 5.6 2011
22 2011-11-23 1 20.4 2011
One possibility could be:
df %>%
mutate(Date = as.Date(Date, format = "%Y-%m-%d"),
diff = c(0, diff(Date))) %>%
group_by(grp = cumsum(diff > 1 & lead(diff, default = last(diff)) == 1)) %>%
filter(if_else(diff > 1 & lead(diff, default = last(diff)) == 1, 1, diff) == 1) %>%
filter(n() >= 3) %>%
ungroup() %>%
select(-diff, -grp)
Date RR.x RR.y Y
<date> <int> <dbl> <int>
1 1984-11-09 1 22.9 1984
2 1984-11-10 1 24.4 1984
3 1984-11-11 1 19 1984
4 1986-10-15 1 10.3 1986
5 1986-10-16 1 18.1 1986
6 1986-10-17 1 11.3 1986
7 2003-10-25 1 7.6 2003
8 2003-10-26 1 5 2003
9 2003-10-27 1 6.6 2003
10 2011-11-21 1 9.8 2011
11 2011-11-22 1 5.6 2011
12 2011-11-23 1 20.4 2011
Here's a base solution:
DF$Date <- as.Date(DF$Date)
rles <- rle(cumsum(c(1,diff(DF$Date)!=1)))
rles$values <- rles$lengths >= 3
DF[inverse.rle(rles), ]
Date RR.x RR.y Y
4 1984-11-09 1 22.9 1984
5 1984-11-10 1 24.4 1984
6 1984-11-11 1 19.0 1984
8 1986-10-15 1 10.3 1986
9 1986-10-16 1 18.1 1986
10 1986-10-17 1 11.3 1986
13 2003-10-25 1 7.6 2003
14 2003-10-26 1 5.0 2003
15 2003-10-27 1 6.6 2003
20 2011-11-21 1 9.8 2011
21 2011-11-22 1 5.6 2011
22 2011-11-23 1 20.4 2011
Similar approach in dplyr
DF%>%
mutate(Date = as.Date(Date))%>%
add_count(IDs = cumsum(c(1, diff(Date) !=1)))%>%
filter(n >= 3)
# A tibble: 12 x 6
Date RR.x RR.y Y IDs n
<date> <int> <dbl> <int> <dbl> <int>
1 1984-11-09 1 22.9 1984 3 3
2 1984-11-10 1 24.4 1984 3 3
3 1984-11-11 1 19 1984 3 3
4 1986-10-15 1 10.3 1986 5 3
5 1986-10-16 1 18.1 1986 5 3
6 1986-10-17 1 11.3 1986 5 3
7 2003-10-25 1 7.6 2003 8 3
8 2003-10-26 1 5 2003 8 3
9 2003-10-27 1 6.6 2003 8 3
10 2011-11-21 1 9.8 2011 13 3
11 2011-11-22 1 5.6 2011 13 3
12 2011-11-23 1 20.4 2011 13 3
I am trying to summarise this daily time serie of rainfall by groups of 10-day periods within each month and calculate the acummulated rainfall.
library(tidyverse)
(dat <- tibble(
date = seq(as.Date("2016-01-01"), as.Date("2016-12-31"), by=1),
rainfall = rgamma(length(date), shape=2, scale=2)))
Therefore, I will obtain variability in the third group along the year, for instance: in january the third period has 11 days, february 9 days, and so on. This is my try:
library(lubridate)
dat %>%
group_by(decade=floor_date(date, "10 days")) %>%
summarize(acum_rainfall=sum(rainfall),
days = n())
this is the resulting output
# A tibble: 43 x 3
decade acum_rainfall days
<date> <dbl> <int>
1 2016-01-01 48.5 10
2 2016-01-11 39.9 10
3 2016-01-21 36.1 10
4 2016-01-31 1.87 1
5 2016-02-01 50.6 10
6 2016-02-11 32.1 10
7 2016-02-21 22.1 9
8 2016-03-01 45.9 10
9 2016-03-11 30.0 10
10 2016-03-21 42.4 10
# ... with 33 more rows
can someone help me to sum the residuals periods to the third one to obtain always 3 periods within each month? This would be the desired output (pay attention to the row 3):
decade acum_rainfall days
<date> <dbl> <int>
1 2016-01-01 48.5 10
2 2016-01-11 39.9 10
3 2016-01-21 37.97 11
4 2016-02-01 50.6 10
5 2016-02-11 32.1 10
6 2016-02-21 22.1 9
One way to do this is to use if_else to apply floor_date with different arguments depending on the day value of date. If day(date) is <30, use the normal way, if it's >= 30, then use '20 days' to ensure it gets rounded to day 21:
dat %>%
group_by(decade=if_else(day(date) >= 30,
floor_date(date, "20 days"),
floor_date(date, "10 days"))) %>%
summarize(acum_rainfall=sum(rainfall),
days = n())
# A tibble: 36 x 3
decade acum_rainfall days
<date> <dbl> <int>
1 2016-01-01 38.8 10
2 2016-01-11 38.4 10
3 2016-01-21 43.4 11
4 2016-02-01 34.4 10
5 2016-02-11 34.8 10
6 2016-02-21 25.3 9
7 2016-03-01 39.6 10
8 2016-03-11 53.9 10
9 2016-03-21 38.1 11
10 2016-04-01 36.6 10
# … with 26 more rows