Mix values from dataframes with different formats - r

I have a data frame with the columns "Year", "Month", "T1", ..., "T31".
For example, df_0 below is the original format, and I want to convert it into new_df (shown in the second part):
id0 <- c("Year", "Month", "T_day1", "T_day2", "T_day3", "T_day4", "T_day5")
id1 <- c("2010", "January", 10, 5, 2, 3, 3)
id2 <- c("2010", "February", 20, 36, 5, 8, 1)
id3 <- c("2010", "March", 12, 23, 23, 5, 25)
df_0 <- rbind(id1, id2, id3)
colnames(df_0) <- id0
head(df_0)
I would like to create a new data frame in which the data from T1...T31 for each month and year are joined to a column containing all dates, for example from 1 January 2010 to 4 January 2012:
date <- seq(as.Date("2010-01-01"), as.Date("2012-01-04"), by = "days")
or, alternatively, join the value into a new column of a data frame based on the values of three other columns (year, month and day):
year  <- lapply(strsplit(as.character(date), "\\-"), "[", 1)
month <- lapply(strsplit(as.character(date), "\\-"), "[", 2)
day   <- lapply(strsplit(as.character(date), "\\-"), "[", 3)
df <- cbind(year, month, day)
In the end I would like to have a data frame with the information arranged like this:
Year <- rep(2010, 15)
Month <- c(rep("January", 5), rep("February", 5), rep("March", 5))
Day <- rep(1:5, times = 3)
Value <- c(10, 5, 2, 3, 3, 20, 36, 5, 8, 1, 12, 23, 23, 5, 25)
new_df <- cbind(Year, Month, Day, Value)
head(new_df)
Thanks in advance

What you're looking for is to reshape your data. One package you can use is reshape2; its melt() function does exactly this:
library(reshape2)
melt(data.frame(df_0), id.vars = c("Year", "Month"))
Based on the data you have, the output would be:
Year Month variable value
1 2010 January T_day1 10
2 2010 February T_day1 20
3 2010 March T_day1 12
4 2010 January T_day2 5
5 2010 February T_day2 36
6 2010 March T_day2 23
7 2010 January T_day3 2
8 2010 February T_day3 5
9 2010 March T_day3 23
10 2010 January T_day4 3
11 2010 February T_day4 8
12 2010 March T_day4 5
13 2010 January T_day5 3
14 2010 February T_day5 1
15 2010 March T_day5 25
You can then convert the variable column into day numbers, depending on how you have formatted those column names.
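For example, assuming the column names follow the T_dayN pattern used above, one way (a sketch) to turn variable into a numeric Day column is:
library(reshape2)

long <- melt(data.frame(df_0), id.vars = c("Year", "Month"),
             variable.name = "Day", value.name = "Value")

# Strip the "T_day" prefix to recover the day number (assumes the T_dayN
# naming used in the question).
long$Day <- as.integer(sub("^T_day", "", long$Day))
head(long)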

First, I generated my own test data, using a reduced date vector for easier demonstration: 2010-01-01 to 2010-03-04. In my df_0 I generated a value for each date in the reduced date vector except the last one, plus a value for one additional date that is not in the date vector: 2010-03-05. It will become clear later why I did this.
set.seed(1);
date <- seq(as.Date('2010-01-01'),as.Date('2010-03-04'),by='day');
df_0 <- reshape(
    setNames(
        as.data.frame(cbind(
            do.call(rbind, strsplit(strftime(c(date[-length(date)], as.Date('2010-03-05')), '%Y %B %d'), ' ')),
            round(rnorm(length(date)), 3)
        )),
        c('Year', 'Month', 'Day', 'T_day')
    ),
    dir = 'w', idvar = c('Year', 'Month'), timevar = 'Day'
);
attr(df_0,'reshapeWide') <- NULL;
df_0;
## Year Month T_day.01 T_day.02 T_day.03 T_day.04 T_day.05 T_day.06 T_day.07 T_day.08 T_day.09 T_day.10 T_day.11 T_day.12 T_day.13 T_day.14 T_day.15 T_day.16 T_day.17 T_day.18 T_day.19 T_day.20 T_day.21 T_day.22 T_day.23 T_day.24 T_day.25 T_day.26 T_day.27 T_day.28 T_day.29 T_day.30 T_day.31
## 1 2010 January -0.626 0.184 -0.836 1.595 0.33 -0.82 0.487 0.738 0.576 -0.305 1.512 0.39 -0.621 -2.215 1.125 -0.045 -0.016 0.944 0.821 0.594 0.919 0.782 0.075 -1.989 0.62 -0.056 -0.156 -1.471 -0.478 0.418 1.359
## 32 2010 February -0.103 0.388 -0.054 -1.377 -0.415 -0.394 -0.059 1.1 0.763 -0.165 -0.253 0.697 0.557 -0.689 -0.707 0.365 0.769 -0.112 0.881 0.398 -0.612 0.341 -1.129 1.433 1.98 -0.367 -1.044 0.57 <NA> <NA> <NA>
## 60 2010 March -0.135 2.402 -0.039 <NA> 0.69 <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA> <NA>
The first half of the solution is a reshaping from wide format to long, and can be done with a single call to reshape(). Additionally, I wrapped it in a call to na.omit() to prevent NA values from being generated from the unavoidable NA cells in df_0:
df_1 <- na.omit(reshape(
    df_0, dir = 'l', idvar = c('Year', 'Month'), timevar = 'Day',
    varying = grep('^T_day\\.', names(df_0)), v.names = 'Value'
));
rownames(df_1) <- NULL;
df_1[order(match(df_1$Month,month.name),df_1$Day),];
## Year Month Day Value
## 1 2010 January 1 -0.626
## 4 2010 January 2 0.184
## 7 2010 January 3 -0.836
## 10 2010 January 4 1.595
## 12 2010 January 5 0.33
## 15 2010 January 6 -0.82
## 17 2010 January 7 0.487
## 19 2010 January 8 0.738
## 21 2010 January 9 0.576
## 23 2010 January 10 -0.305
## 25 2010 January 11 1.512
## 27 2010 January 12 0.39
## 29 2010 January 13 -0.621
## 31 2010 January 14 -2.215
## 33 2010 January 15 1.125
## 35 2010 January 16 -0.045
## 37 2010 January 17 -0.016
## 39 2010 January 18 0.944
## 41 2010 January 19 0.821
## 43 2010 January 20 0.594
## 45 2010 January 21 0.919
## 47 2010 January 22 0.782
## 49 2010 January 23 0.075
## 51 2010 January 24 -1.989
## 53 2010 January 25 0.62
## 55 2010 January 26 -0.056
## 57 2010 January 27 -0.156
## 59 2010 January 28 -1.471
## 61 2010 January 29 -0.478
## 62 2010 January 30 0.418
## 63 2010 January 31 1.359
## 2 2010 February 1 -0.103
## 5 2010 February 2 0.388
## 8 2010 February 3 -0.054
## 11 2010 February 4 -1.377
## 13 2010 February 5 -0.415
## 16 2010 February 6 -0.394
## 18 2010 February 7 -0.059
## 20 2010 February 8 1.1
## 22 2010 February 9 0.763
## 24 2010 February 10 -0.165
## 26 2010 February 11 -0.253
## 28 2010 February 12 0.697
## 30 2010 February 13 0.557
## 32 2010 February 14 -0.689
## 34 2010 February 15 -0.707
## 36 2010 February 16 0.365
## 38 2010 February 17 0.769
## 40 2010 February 18 -0.112
## 42 2010 February 19 0.881
## 44 2010 February 20 0.398
## 46 2010 February 21 -0.612
## 48 2010 February 22 0.341
## 50 2010 February 23 -1.129
## 52 2010 February 24 1.433
## 54 2010 February 25 1.98
## 56 2010 February 26 -0.367
## 58 2010 February 27 -1.044
## 60 2010 February 28 0.57
## 3 2010 March 1 -0.135
## 6 2010 March 2 2.402
## 9 2010 March 3 -0.039
## 14 2010 March 5 0.69
The second part of the solution requires merging the above long-format data.frame with the exact dates you stated you want in the resulting data.frame. This requires a fair amount of scaffolding code to transform the date vector into a data.frame with Year, Month, and Day columns, but once that's done, you can simply call merge() with all.x=T to preserve every date in the date vector whether or not it was present in df_1, and to exclude any date in df_1 that is not also present in the date vector:
df_2 <- merge(
    transform(
        setNames(
            as.data.frame(do.call(rbind, strsplit(strftime(date, '%Y %B %d'), ' '))),
            c('Year', 'Month', 'Day')
        ),
        Day = as.integer(Day)
    ),
    df_1,
    all.x = T
);
df_2[order(match(df_2$Month,month.name),df_2$Day),];
## Year Month Day Value
## 29 2010 January 1 -0.626
## 30 2010 January 2 0.184
## 31 2010 January 3 -0.836
## 32 2010 January 4 1.595
## 33 2010 January 5 0.33
## 34 2010 January 6 -0.82
## 35 2010 January 7 0.487
## 36 2010 January 8 0.738
## 37 2010 January 9 0.576
## 38 2010 January 10 -0.305
## 39 2010 January 11 1.512
## 40 2010 January 12 0.39
## 41 2010 January 13 -0.621
## 42 2010 January 14 -2.215
## 43 2010 January 15 1.125
## 44 2010 January 16 -0.045
## 45 2010 January 17 -0.016
## 46 2010 January 18 0.944
## 47 2010 January 19 0.821
## 48 2010 January 20 0.594
## 49 2010 January 21 0.919
## 50 2010 January 22 0.782
## 51 2010 January 23 0.075
## 52 2010 January 24 -1.989
## 53 2010 January 25 0.62
## 54 2010 January 26 -0.056
## 55 2010 January 27 -0.156
## 56 2010 January 28 -1.471
## 57 2010 January 29 -0.478
## 58 2010 January 30 0.418
## 59 2010 January 31 1.359
## 1 2010 February 1 -0.103
## 2 2010 February 2 0.388
## 3 2010 February 3 -0.054
## 4 2010 February 4 -1.377
## 5 2010 February 5 -0.415
## 6 2010 February 6 -0.394
## 7 2010 February 7 -0.059
## 8 2010 February 8 1.1
## 9 2010 February 9 0.763
## 10 2010 February 10 -0.165
## 11 2010 February 11 -0.253
## 12 2010 February 12 0.697
## 13 2010 February 13 0.557
## 14 2010 February 14 -0.689
## 15 2010 February 15 -0.707
## 16 2010 February 16 0.365
## 17 2010 February 17 0.769
## 18 2010 February 18 -0.112
## 19 2010 February 19 0.881
## 20 2010 February 20 0.398
## 21 2010 February 21 -0.612
## 22 2010 February 22 0.341
## 23 2010 February 23 -1.129
## 24 2010 February 24 1.433
## 25 2010 February 25 1.98
## 26 2010 February 26 -0.367
## 27 2010 February 27 -1.044
## 28 2010 February 28 0.57
## 60 2010 March 1 -0.135
## 61 2010 March 2 2.402
## 62 2010 March 3 -0.039
## 63 2010 March 4 <NA>
Notice how 2010-03-04 is included, even though I didn't generate a value for it in df_0, and 2010-03-05 is excluded, even though I did.
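For completeness, here is a hedged sketch of the same two steps using tidyr and dplyr, assuming the df_0 and date objects built above (df_0 stores everything as character, so Value is converted back to numeric):
library(dplyr)
library(tidyr)

# Step 1: wide -> long, dropping the NA cells.
df_long <- df_0 %>%
  pivot_longer(starts_with("T_day."),
               names_to = "Day", names_prefix = "T_day\\.",
               values_to = "Value", values_drop_na = TRUE) %>%
  mutate(Day = as.integer(Day), Value = as.numeric(Value))

# Step 2: build a Year/Month/Day scaffold from the date vector and keep
# exactly those dates, whether or not df_long has a value for them.
calendar <- tibble(d = date) %>%
  transmute(Year  = format(d, "%Y"),
            Month = format(d, "%B"),
            Day   = as.integer(format(d, "%d")))

df_final <- left_join(calendar, df_long, by = c("Year", "Month", "Day"))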

Related

Forecasting one step ahead

I have a data.frame with three columns: Year, Nominal_Revenue and COEFFICIENT. I want to produce a forecast from this data as in the example below:
library(dplyr)
TEST <- data.frame(
  Year = c(2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021),
  Nominal_Revenue = c(8634,5798,6011,6002,6166,6478,6731,7114,6956,6968,7098,7610,7642,8203,9856,10328,11364,12211,13150,NA,NA,NA),
  COEFFICIENT = c(NA,1.016,1.026,1.042,1.049,1.106,1.092,1.123,1.121,0.999,1.059,1.066,1.006,1.081,1.055,1.063,1.071,1.04,1.072,1.062,1.07,1.075))
SIMULATION <- mutate(TEST,
                     FORECAST = lag(Nominal_Revenue) * COEFFICIENT)
This code only produces a forecast for a single year, 2019. My intention is to get forecasts for every NA in the Nominal_Revenue column. Can anybody help me fix this code?
Because each step needs the previously computed value, we can loop as many times as there are NAs in your variable and apply a dplyr mutate() each time:
for (i in 1:length(which(is.na(TEST$Nominal_Revenue)))) {
  TEST <- TEST %>%
    mutate(Nominal_Revenue = if_else(is.na(Nominal_Revenue),
                                     COEFFICIENT * lag(Nominal_Revenue),
                                     Nominal_Revenue))
}
> TEST
Year Nominal_Revenue COEFFICIENT
1 2000 8634.00 NA
2 2001 5798.00 1.016
3 2002 6011.00 1.026
4 2003 6002.00 1.042
5 2004 6166.00 1.049
6 2005 6478.00 1.106
7 2006 6731.00 1.092
8 2007 7114.00 1.123
9 2008 6956.00 1.121
10 2009 6968.00 0.999
11 2010 7098.00 1.059
12 2011 7610.00 1.066
13 2012 7642.00 1.006
14 2013 8203.00 1.081
15 2014 9856.00 1.055
16 2015 10328.00 1.063
17 2016 11364.00 1.071
18 2017 12211.00 1.040
19 2018 13150.00 1.072
20 2019 13965.30 1.062
21 2020 14942.87 1.070
22 2021 16063.59 1.075
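If you prefer to avoid the explicit loop, the same running calculation can be written with base R's Reduce(), starting again from the original TEST (a sketch; it assumes, as in the example, that the NAs come after the observed revenues):
# Carry the last computed revenue forward, multiplying by COEFFICIENT
# wherever Nominal_Revenue is NA.
TEST$Nominal_Revenue <- Reduce(
  function(prev, i) {
    if (is.na(TEST$Nominal_Revenue[i])) prev * TEST$COEFFICIENT[i]
    else TEST$Nominal_Revenue[i]
  },
  seq_len(nrow(TEST))[-1],
  init = TEST$Nominal_Revenue[1],
  accumulate = TRUE
)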

vis.gam transparent at top of "persp" graph

I am running a GAMM using the mgcv package. The model runs fine and gives output that makes sense, but when I use vis.gam(plot.type="persp"), the top of the surface in the resulting plot is rendered transparent.
Why is this happening? When I use vis.gam(plot.type="contour") there is no area which is transparent.
It does not appear to be simply a problem with the heat color palette; the same thing happens when I change the color scheme of the "persp" plot:
[persp plot with the "topo" colour scheme: still transparent at the top]
The contour plot is completely filled while the persp plot is still transparent at the top.
Data:
logcpue assnage distkm fsamplingyr
1 -1.5218399 7 3.490 2015
2 -1.6863990 4 3.490 2012
3 -1.4534337 6 3.490 2014
4 -1.5207723 5 3.490 2013
5 -2.4061258 2 3.490 2010
6 -2.5427262 3 3.490 2011
7 -1.6177367 3 3.313 1998
8 -4.4067192 10 3.313 2005
9 -4.3438054 11 3.313 2006
10 -2.8834031 7 3.313 2002
11 -2.3182512 2 3.313 1997
12 -4.1108738 1 3.235 2010
13 -2.0149030 3 3.235 2012
14 -1.4900912 6 3.235 2015
15 -3.7954892 2 3.235 2011
16 -1.6499840 4 3.235 2013
17 -1.9924302 5 3.235 2014
18 -1.2122716 4 3.189 1998
19 -0.6675703 3 3.189 1997
20 -4.7957905 7 3.106 1998
21 -3.8763958 6 3.106 1997
22 -1.2205021 4 3.073 2010
23 -1.9262374 7 3.073 2013
24 -3.3463891 9 3.073 2015
25 -1.7805862 2 3.073 2008
26 -3.2451931 8 3.073 2014
27 -1.4441139 5 3.073 2011
28 -1.4395389 6 3.073 2012
29 -1.6357552 4 2.876 2014
30 -1.3449091 5 2.876 2015
31 -2.3782225 3 2.876 2013
32 -4.4886364 1 2.876 2011
33 -2.6026897 2 2.876 2012
34 -3.5765503 1 2.147 2002
35 -4.8040211 9 2.147 2010
36 -1.3993664 5 2.147 2006
37 -1.2712250 4 2.147 2005
38 -1.8495790 7 2.147 2008
39 -2.5073795 1 2.034 2012
40 -2.0654553 4 2.034 2015
41 -3.6309855 2 2.034 2013
42 -2.2643639 3 2.034 2014
43 -2.2643639 6 1.452 2006
44 -3.3900241 8 1.452 2008
45 -4.9628446 2 1.452 2002
46 -2.0088240 5 1.452 2005
47 -3.9186675 1 1.323 2013
48 -4.3438054 2 1.323 2014
49 -3.5695327 3 1.323 2015
50 -1.6986690 7 1.200 2005
51 -3.2451931 8 1.200 2006
52 -0.9024016 4 1.200 2002
library(mgcv)
f1 <- formula(logcpue ~ s(assnage) + distkm)
m1 <- gamm(f1, random = list(fsamplingyr = ~1),
           method = "REML",
           data = ycsnew)
vis.gam(m1$gam, color = "topo", plot.type = "persp", theta = 180)
vis.gam(m1$gam, color = "heat", plot.type = "persp", theta = 180)
vis.gam(m1$gam, view = c("assnage", "distkm"),
        plot.type = "contour", color = "heat", las = 1)
vis.gam(m1$gam, view = c("assnage", "distkm"),
        plot.type = "contour", color = "terrain", las = 1, contour.col = "black")
The code of vis.gam has this:
surf.col[surf.col > max.z * 2] <- NA
I am unable to understand what it is doing, and it appears to be rather ad hoc. NA color values are generally rendered as transparent. If you comment out that line in a copy of the function (call it vis.gam2) and assign the environment of the new function with:
environment(vis.gam2) <- environment(vis.gam)
... you get complete coloring of the surface.
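A minimal sketch of that recipe, assuming the offending statement deparses on a single line (vis.gam2 here is just an edited copy, not a real mgcv function):
library(mgcv)

# Rebuild vis.gam without the line that blanks out-of-range colours
# (NA colours are what persp() draws as transparent).
src <- deparse(vis.gam)
src <- src[!grepl("surf.col > max.z * 2", src, fixed = TRUE)]
vis.gam2 <- eval(parse(text = src))

# The copy still needs mgcv's namespace to find internal helpers.
environment(vis.gam2) <- environment(vis.gam)

# Then call it exactly like vis.gam, e.g. with the model fitted above:
# vis.gam2(m1$gam, color = "heat", plot.type = "persp", theta = 180)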

Merging complementary rows of a dataframe with R

I have such a data frame
0 weekday day month year hour basal bolus carb period.h
1 Tuesday 01 03 2016 0.0 0.25 NA NA 0
2 Tuesday 01 03 2016 10.9 NA NA 67 10
3 Tuesday 01 03 2016 10.9 NA 4.15 NA 10
4 Tuesday 01 03 2016 12.0 0.30 NA NA 12
5 Tuesday 01 03 2016 17.0 0.50 NA NA 17
6 Tuesday 01 03 2016 17.6 NA NA 33 17
7 Tuesday 01 03 2016 17.6 NA 1.35 NA 17
8 Tuesday 01 03 2016 18.6 NA NA 44 18
9 Tuesday 01 03 2016 18.6 NA 1.80 NA 18
10 Tuesday 01 03 2016 18.9 NA NA 17 18
11 Tuesday 01 03 2016 18.9 NA 0.70 NA 18
12 Tuesday 01 03 2016 22.0 0.40 NA NA 22
13 Wednesday 02 03 2016 0.0 0.25 NA NA 0
14 Wednesday 02 03 2016 9.7 NA NA 39 9
15 Wednesday 02 03 2016 9.7 NA 2.65 NA 9
16 Wednesday 02 03 2016 11.2 NA NA 13 11
17 Wednesday 02 03 2016 11.2 NA 0.30 NA 11
18 Wednesday 02 03 2016 12.0 0.30 NA NA 12
19 Wednesday 02 03 2016 12.0 NA NA 16 12
20 Wednesday 02 03 2016 12.0 NA 0.65 NA 12
If you look at lines 2 and 3, you will notice that they correspond to exactly the same day and time: for line 2 "carb" is not NA, while for line 3 "bolus" is not NA (these are data about diabetes).
I want to merge such lines into a single one:
2 Tuesday 01 03 2016 10.9 NA NA 67 10
3 Tuesday 01 03 2016 10.9 NA 4.15 NA 10
->
2 Tuesday 01 03 2016 10.9 NA 4.15 67 10
I could of course do a brutal double loop over each line, but I am looking for a cleverer and faster way.
You can group your data frame by the common identifier columns (weekday, day, month, year, hour and period.h here) and then, for each remaining column you want to merge, sort it and take the first element. sort() removes NAs by default, so within each group you end up with the non-NA element of each column; if all elements in a column are NA, sort(col)[1] returns NA:
library(dplyr)
df %>%
  group_by(weekday, day, month, year, hour, period.h) %>%
  summarise_all(funs(sort(.)[1]))
# weekday day month year hour period.h basal bolus carb
# <fctr> <int> <int> <int> <dbl> <int> <dbl> <dbl> <int>
# 1 Tuesday 1 3 2016 0.0 0 0.25 NA NA
# 2 Tuesday 1 3 2016 10.9 10 NA 4.15 67
# 3 Tuesday 1 3 2016 12.0 12 0.30 NA NA
# 4 Tuesday 1 3 2016 17.0 17 0.50 NA NA
# 5 Tuesday 1 3 2016 17.6 17 NA 1.35 33
# 6 Tuesday 1 3 2016 18.6 18 NA 1.80 44
# 7 Tuesday 1 3 2016 18.9 18 NA 0.70 17
# 8 Tuesday 1 3 2016 22.0 22 0.40 NA NA
# 9 Wednesday 2 3 2016 0.0 0 0.25 NA NA
# 10 Wednesday 2 3 2016 9.7 9 NA 2.65 39
# 11 Wednesday 2 3 2016 11.2 11 NA 0.30 13
# 12 Wednesday 2 3 2016 12.0 12 0.30 0.65 16
Instead of sort(), maybe a more appropriate function to use here is na.omit():
df %>%
  group_by(weekday, day, month, year, hour, period.h) %>%
  summarise_all(funs(na.omit(.)[1]))
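On current dplyr versions, funs() is deprecated; a sketch of the same idea with across() and a lambda:
library(dplyr)

# First non-NA value per group for every remaining column; stays NA if the
# whole group is NA (same behaviour as the na.omit() variant above).
df %>%
  group_by(weekday, day, month, year, hour, period.h) %>%
  summarise(across(everything(), ~ na.omit(.x)[1]), .groups = "drop")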

Linear model/lmList with nested/multiple group categories?

I am trying to build a model for monthly energy consumption based on weather, grouped by location (there are ~1100) AND year (I would like to do it from 2011-2014). The data is called factin and looks like this:
Store Month Days UPD HD CD Year
1 August, 2013 31 6478.27 0.06 10.03 2013
1 September, 2013 30 6015.38 0.50 5.67 2013
1 October, 2013 31 5478.21 5.29 1.48 2013
1 November, 2013 30 5223.78 18.60 0.00 2013
1 December, 2013 31 5115.80 20.52 0.23 2013
6 January, 2011 31 4517.56 27.45 0.00 2011
6 February, 2011 28 4116.07 16.75 0.07 2011
6 March, 2011 31 3981.78 12.68 0.39 2011
6 April, 2011 30 4041.68 3.83 2.53 2011
6 May, 2011 31 4287.23 1.61 6.58 2011
And my model code, which just spits out 1 set of coefficients for all the years of each store, looks like this:
factout <- lmList(UPD ~ HD + CD | Store, factin)
My question is: is there any way I can get coefficients for each store AND year without creating a separate data frame for each year?
dat <- read.table(header = T, stringsAsFactors = F, text = "Store Month year Days UPD HD CD Year
1 August 2013 31 6478.27 0.06 10.03 2013
1 September 2013 30 6015.38 0.50 5.67 2013
1 October 2013 31 5478.21 5.29 1.48 2013
1 November 2013 30 5223.78 18.60 0.00 2013
1 December 2013 31 5115.80 20.52 0.23 2013
6 January 2011 31 4517.56 27.45 0.00 2011
6 February 2011 28 4116.07 16.75 0.07 2011
6 March 2011 31 3981.78 12.68 0.39 2011
6 April 2011 30 4041.68 3.83 2.53 2011
6 May 2011 31 4287.23 1.61 6.58 2011")
library(nlme)  # for lmList(); lme4 also provides a version
factout <- lmList(UPD ~ HD + CD | Store, dat)
data.frame(Store = unique(dat$Store), summary(factout)$coef[1:2, 1, 1:3])
(Intercept) HD CD
1 5405.108 -12.90986 107.2061
6 3581.307 32.93137 102.9780
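To get coefficients per store and year without building a separate data frame for each year, one option (a sketch; lmList() from nlme is assumed, and lme4's version behaves the same way) is to group on the Store/Year interaction:
library(nlme)

# One regression per Store-Year combination; with the small example data each
# store only spans one year, so this reproduces the table above.
dat$StoreYear <- interaction(dat$Store, dat$Year, drop = TRUE)
factout_sy <- lmList(UPD ~ HD + CD | StoreYear, dat)
coef(factout_sy)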

How to save the WindRose plot result as summary table?

library("openair")
library("plyr")
I have the following line of code and I need to summarize the result of the windRose plot as compass values (e.g. N = 1000, NE = 6859, W = 9585, etc.). The function comes from the openair package. I do not know how to extract the frequency values for each wind-direction paddle from the plot.
windRose(viento,"WS_ms_Avg","WindDir",type="mes", paddle=FALSE,layout=c(3,4))
Is this what you're looking for?
tmp_1 <- windRose(mydata)
head(tmp_1$data)
## default Interval1 Interval2 Interval3 Interval4
## 1 01 January 1998 to 23 June 2005 0.9918888 4.560834 6.322132 6.838161
## 2 01 January 1998 to 23 June 2005 0.4665894 2.679027 4.344535 5.113944
## 3 01 January 1998 to 23 June 2005 0.4264195 2.479722 4.339900 4.956354
## 4 01 January 1998 to 23 June 2005 0.5561993 2.399382 3.589031 4.018540
## 5 01 January 1998 to 23 June 2005 0.6056392 2.717652 4.105060 4.863654
## 6 01 January 1998 to 23 June 2005 0.7122441 3.797605 7.341831 10.246427
## wd calm panel.fun mean.wd freqs
## 1 30 0.1 4.49 -118.3 4426
## 2 60 0.1 4.49 -118.3 3310
## 3 90 0.1 4.49 -118.3 3208
## 4 120 0.1 4.49 -118.3 2601
## 5 150 0.1 4.49 -118.3 3148
## 6 180 0.1 4.49 -118.3 6632
tmp_2 <- windRose(mydata, type="year")
head(tmp_2$data)
## year Interval1 Interval2 Interval3 Interval4 wd calm panel.fun mean.wd
## 1 1998 1.5962554 6.204993 7.057129 7.093135 30 0.2 4.38 -110.4
## 2 1998 0.5160826 3.300528 4.176668 4.200672 60 0.2 4.38 -110.4
## 3 1998 0.2760442 1.500240 2.472396 2.652424 90 0.2 4.38 -110.4
## 4 1998 0.5160826 2.244359 3.192511 3.480557 120 0.2 4.38 -110.4
## 5 1998 0.7561210 2.568411 3.552568 3.972636 150 0.2 4.38 -110.4
## 6 1998 0.9481517 3.984638 7.333173 9.841575 180 0.2 4.38 -110.4
## freqs
## 1 591
## 2 350
## 3 221
## 4 290
## 5 331
## 6 820
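If you want a single total per compass sector rather than per speed interval, one way (a sketch, using openair's bundled mydata as above; angle = 45 gives one bin per compass point) is to aggregate the freqs column of the returned data:
library(openair)

# 45-degree sectors so each bin corresponds to one compass point.
tmp <- windRose(mydata, angle = 45)

# Total frequency (and percentage) per wind-direction bin.
freq_table <- aggregate(freqs ~ wd, data = tmp$data, FUN = sum)
freq_table$percent <- 100 * freq_table$freqs / sum(freq_table$freqs)
freq_table

# The same idea should apply to the question's data, e.g. (column names as in
# the question):
# tmp <- windRose(viento, "WS_ms_Avg", "WindDir", type = "mes", angle = 45)
# aggregate(freqs ~ wd + mes, data = tmp$data, FUN = sum)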
