Apply function and create new row - r

I have a data.frame as such:
X 1976 1977
1 6.4 6.9
2 6.3 7.0
3 6.1 7.1
4 6.0 7.2
I want to create the following:
Qtr Value
1976.00 6.27
1976.25 ...
And so on...
1977.00 7.0
1977.25 ...
And so on.
EDIT: The output is the average of the first 3 values. My apologies.
Can anybody help me out? Thanks in advance.
Robert

Here's an approach.
Your data frame:
# Recreate the example data from the question. check.names = FALSE keeps the
# year columns named "1976" and "1977" instead of converting them to syntactic
# names, so they can be turned back into numbers later.
dat <- read.table(text = "X 1976 1977
1 6.4 6.9
2 6.3 7.0
3 6.1 7.1
4 6.0 7.2", header = TRUE, check.names = FALSE)
The commands:
# Average every year column over runs of three rows:
# X = 1..3 -> group 0, X = 4..6 -> group 1, and so on.
agg <- aggregate(dat[-1], by = list((dat$X - 1) %/% 3), FUN = mean)

# Reshape to long form: one row per (year, group), year label first.
dat2 <- stack(agg[-1])[c("ind", "values")]
names(dat2) <- c("Qtr", "Value")

# Year label -> number, plus a quarter offset of 0.25 per group. The group ids
# in agg[[1]] are recycled once per stacked year column.
dat2$Qtr <- as.numeric(as.character(dat2$Qtr)) + 0.25 * agg[[1]]
The result:
dat2
# Qtr Value
# 1 1976.00 6.266667
# 2 1976.25 6.000000
# 3 1977.00 7.000000
# 4 1977.25 7.200000

Try:
# Example data: 12 observations per year (X is the row index 1..12).
# Three consecutive rows form one quarter, as the question asks for the
# average of three values per quarter.
ddf <- data.frame(
  X = 1:12,
  `1976` = c(6.4, 6.3, 6.1, 6, 6, 6.3, 6.1, 6, 6.4, 6.8, 6.6, 6),
  `1977` = c(6.9, 7, 7.1, 7.2, 7.2, 7.1, 7.2, 7.5, 7.2, 7.6, 7.8, 7.2),
  check.names = FALSE
)
ddf
#     X 1976 1977
# 1   1  6.4  6.9
# 2   2  6.3  7.0
# 3   3  6.1  7.1
# 4   4  6.0  7.2
# 5   5  6.0  7.2
# 6   6  6.3  7.1
# 7   7  6.1  7.2
# 8   8  6.0  7.5
# 9   9  6.4  7.2
# 10 10  6.8  7.6
# 11 11  6.6  7.8
# 12 12  6.0  7.2

# Number of consecutive rows that make up one quarter.
rows_per_qtr <- 3L

# Every column except the index column X is a year.
year_cols <- setdiff(names(ddf), "X")

# Zero-based quarter id per row: 0, 0, 0, 1, 1, 1, 2, ...
qtr_index <- (ddf$X - 1L) %/% rows_per_qtr

# One row per quarter, one mean column per year.
qtr_means <- aggregate(ddf[year_cols], by = list(qtr = qtr_index), FUN = mean)

# Long format: the year as a number plus 0.25 per quarter, with the matching
# mean. unlist() concatenates the year columns in order, so the values line up
# with the repeated year labels.
df2 <- data.frame(
  qtr = rep(as.numeric(year_cols), each = nrow(qtr_means)) +
    rep(0.25 * qtr_means$qtr, times = length(year_cols)),
  value = unlist(qtr_means[year_cols], use.names = FALSE)
)
df2
#       qtr    value
# 1 1976.00 6.266667
# 2 1976.25 6.100000
# 3 1976.50 6.166667
# 4 1976.75 6.466667
# 5 1977.00 7.000000
# 6 1977.25 7.166667
# 7 1977.50 7.300000
# 8 1977.75 7.533333

Related

fill NA values with mean of preceding and subsequent values

I'm working with a dataset of weather variables (temperature, precipitation, etc.) that has a few missing values. Because of my specific approach (summing these variables across several days), I need to address NA values in the dataset.
When there is a missing daily value, I'd like to fill that day with a mean value of the previous and following day. The assumption here is that weather values are similar from one day to the next. And yes, I realize this is a big assumption.
I've developed the following:
# Daily maximum temperatures; NA marks a day with a missing value.
maxTemp <- c(13.2, 10.7, NA, 17.9, 6.6, 10, 13, NA, NA, 8.8, 9.9, 14.9, 16.3, NA, 18, 9.9, 11.5, 15.3, 21.7, 23.9, 26.6, 27, 22.3, NA, 17.9)
weather <- as.data.frame(maxTemp)
# Fill each NA with the mean of the previous and the next row's value.
# lag()/lead() look only at the immediate neighbours, so when a neighbour is
# itself NA (two missing days in a row) the result is still NA.
weather %>%
mutate(maxTempNA = if_else(is.na(maxTemp),
(lag(maxTemp) + lead(maxTemp))/2,
maxTemp))
However, in a few cases, I have two NA values on consecutive days, so this doesn't work. Any thoughts on approaches to code this so that when there are two (or more) NA's in a row, the average uses the 'bookending' values to fill the NAs?
The final result would look like this:
maxTemp <- c(13.2, 10.7, 14.3, 17.9, 6.6, 10, 13, 10.9, 10.9, 8.8, 9.9, 14.9, 16.3, 17.15, 18, 9.9, 11.5, 15.3, 21.7, 23.9, 26.6, 27, 22.3, 20.1, 17.9)
How about using approx to replace NAs with interpolated values; by default, approx uses linear interpolation, so this should match your manual replace-by-mean results.
# Linear interpolation over the row index. approx() leaves out the NA
# observations by default, so every gap is filled from the known values on
# either side of it.
weather %>%
  mutate(
    maxTemp_interp = approx(
      x = seq_along(maxTemp),
      y = maxTemp,
      xout = seq_along(maxTemp)
    )$y
  )
# maxTemp maxTemp_interp
# 1 13.2 13.20
# 2 10.7 10.70
# 3 NA 14.30
# 4 17.9 17.90
# 5 6.6 6.60
# 6 10.0 10.00
# 7 13.0 13.00
# 8 NA 11.60
# 9 NA 10.20
# 10 8.8 8.80
# 11 9.9 9.90
# 12 14.9 14.90
# 13 16.3 16.30
# 14 NA 17.15
# 15 18.0 18.00
# 16 9.9 9.90
# 17 11.5 11.50
# 18 15.3 15.30
# 19 21.7 21.70
# 20 23.9 23.90
# 21 26.6 26.60
# 22 27.0 27.00
# 23 22.3 22.30
# 24 NA 20.10
# 25 17.9 17.90
I've created a new column here to make it easier to compare with the original data.
Update
Markus pointed out in the comments (thanks @markus) that to reproduce your expected output, you'd actually need method = "constant" with f = 0.5:
# Step interpolation over the row index. With method = "constant" and f = 0.5,
# every point inside a gap gets the midpoint of the known values to its left
# and right, so a run of consecutive NAs is filled with one shared mean.
weather %>%
  mutate(
    maxTemp_interp = approx(
      x = seq_along(maxTemp),
      y = maxTemp,
      xout = seq_along(maxTemp),
      method = "constant",
      f = 0.5
    )$y
  )
# maxTemp maxTemp_interp
# 1 13.2 13.20
# 2 10.7 10.70
# 3 NA 14.30
# 4 17.9 17.90
# 5 6.6 6.60
# 6 10.0 10.00
# 7 13.0 13.00
# 8 NA 10.90
# 9 NA 10.90
# 10 8.8 8.80
# 11 9.9 9.90
# 12 14.9 14.90
# 13 16.3 16.30
# 14 NA 17.15
# 15 18.0 18.00
# 16 9.9 9.90
# 17 11.5 11.50
# 18 15.3 15.30
# 19 21.7 21.70
# 20 23.9 23.90
# 21 26.6 26.60
# 22 27.0 27.00
# 23 22.3 22.30
# 24 NA 20.10
# 25 17.9 17.90
If you want to use the mean of the most recent non-NA value going backwards and forwards, you can use something like data.table::nafill() to fill values both down and up, and then take the mean:
# Most recent known value looking backwards (last observation carried forward)
weather$prevTemp <- data.table::nafill(weather$maxTemp, type = "locf")
# Nearest known value looking forwards (next observation carried backward)
weather$nextTemp <- data.table::nafill(weather$maxTemp, type = "nocb")
# Missing days get the mean of the two bookending values; known days are kept.
weather$maxTemp <- ifelse(
  is.na(weather$maxTemp),
  (weather$prevTemp + weather$nextTemp) / 2,
  weather$maxTemp
)

Aggregate with a start and end of date

I'm new to R, so this may be simple, but I haven't found out how to do it yet.
I'm trying to aggregate my temperature data by day so I have a mean temperature for every day of the year.
Here's an example of my data and the code I made :
Date Qobs Ptot Fsol Temp PE X
1 1956-11-01 0.001 14.0 -99 12.0 1.4 NA
2 1956-11-02 0.001 0.0 -99 13.5 1.5 NA
3 1956-11-03 0.001 0.0 -99 13.5 1.5 NA
4 1956-11-04 0.001 0.0 -99 13.0 1.4 NA
5 1956-11-05 0.001 0.0 -99 11.5 1.3 NA
6 1956-11-06 0.001 0.0 -99 11.0 1.2 NA
7 1956-11-07 0.001 2.0 -99 12.5 1.3 NA
8 1956-11-08 0.000 0.0 -99 5.0 0.7 NA
9 1956-11-09 0.000 0.5 -99 0.0 0.4 NA
10 1956-11-10 0.000 0.0 -99 -2.5 0.2 NA
11 1956-11-11 0.000 2.5 -99 5.5 0.8 NA
12 1956-11-12 0.000 0.0 -99 7.5 0.9 NA
# Mean temperature per day of year: format(..., "%j") turns each date into its
# day-of-year number as a string ("001"-"366"), which is the grouping key.
# Assumes tmp_data$Date is a Date (or date-time) column.
reg_T=aggregate(x=tmp_data$Temp, by=list(j=format(tmp_data$Date, "%j")), mean)
But as you can see, my data doesn't start on the 1st of January: the first day of my data is 01/11, which makes things complicated later, once it's aggregated.
How can I aggregate and define the start at the 01/01 and make it forget the beginning and end of my data because they are not complete years?
Thanks!
dput() of the data:
# Sample of the data as dput() output: 30 consecutive days with their
# temperature. Dates are stored as day counts relative to 1970-01-01, so
# -4809 is 1956-11-01, the first row of the example shown in the question.
df <- structure(list(Date = structure(c(-4809, -4808, -4807, -4806, -4805, -4804,
-4803, -4802, -4801, -4800, -4799, -4798, -4797,
-4796, -4795, -4794, -4793, -4792, -4791, -4790,
-4789, -4788, -4787, -4786, -4785, -4784, -4783,
-4782, -4781, -4780), class = "Date"),
Temp = c(12, 13.5, 13.5, 13, 11.5, 11, 12.5, 5, 0, -2.5, 5.5, 7.5,
1.5, 6, 14, 6, 0.5, 0.5, 4, 2, 9, -4.5, -11.5, -10, -4.5,
-2.5, -3.5, -1, -1.5, -7.5)),
.Names = c("Date", "Temp"), row.names = c(NA, 30L), class = "data.frame")
What about something like this:
# library() stops with an error if the package is missing, whereas require()
# only returns FALSE and lets the script fail later with a confusing message.
library(tidyverse)

# Mean temperature per calendar day ("MM-DD"), pooled over all years present.
df %>%
  mutate(MonthDay = format(Date, "%m-%d")) %>%
  group_by(MonthDay) %>%
  summarise(MeanDay = mean(Temp, na.rm = TRUE))
# A tibble: 30 x 2
MonthDay MeanDay
<chr> <dbl>
1 11-01 12.0
2 11-02 13.5
3 11-03 13.5
4 11-04 13.0
5 11-05 11.5
6 11-06 11.0
7 11-07 12.5
8 11-08 5.00
9 11-09 0.
10 11-10 -2.50
# ... with 20 more rows

sales calculation as per the below method

I have a sales calculation to do and need to define some basic predicted sales as per the formula given.
df1: cut_of_sales
cut-off_sales
1
2
1
3
df2: actual df for data:
Sales
NA
NA
NA
NA
1.2
2.1
1.4
1.1
2.1
1.4
1.1
1.2
2.1
1.4
1.1
1.2
2.1
1.4
1.1
2.3
First 4 quarters are NA. Keep them as they are.
Start with 5th row by adding the first value for cutoff_sales
Explanation:
1. cutoff_sales is predefined by the company; 4 values are given, one for each quarter.
2. Add the q1 value of the cutoff sales to 2010q1 = ansq1
3. Add the q2 value of the cutoff sales to 2010q2 = ansq2
4. Do the same for q3 and q4.
The answers of the above additions will then be the input for the next year's (2011) quarters,
so ansq1 + 2011q1 = ans...
ansq2 + 2011q2 = ans ....
and so on: the answers for the 2011 quarters will be the input for 2012, those for 2012 the input for 2013, and so on for the rest of the 10 years.
Please help me in doing this addition.
I was only able to do the first year addition.
Please help me write a function or a loop that is iterative, as there will be many more years coming up.
thanks.
For updated question
With the updated question, the following is one way to achieve the task. Since this is quarter data and the first four rows are NA, you can add the values of cut_off in mydf1 to Sales first. Then, you create a grouping variable. 1 indicates first quarter. You can sum up Sales with cumsum() as I suggested in my previous answer. It seems that you want to keep the NAs. So I converted 0 to NA in the end.
# Seed the first observed year (rows 5-8) with the four cut-off values.
mydf2[5:8, "Sales"] <- mydf2[5:8, "Sales"] + mydf1[["cut_off"]]

# Label rows 1, 2, 3, 4, 1, 2, ... by quarter and accumulate sales within each
# quarter. NAs count as 0 while summing; a running total that is still 0 is
# turned back into NA so the leading NA rows stay NA.
mydf2 %>%
  group_by(quarter = rep(1:4, times = n() / 4)) %>%
  mutate(Sales = na_if(cumsum(coalesce(Sales, 0)), 0))
Sales quarter
<dbl> <int>
1 NA 1
2 NA 2
3 NA 3
4 NA 4
5 2.20 1
6 4.10 2
7 2.40 3
8 4.10 4
9 4.30 1
10 5.50 2
11 3.50 3
12 5.30 4
13 6.40 1
14 6.90 2
15 4.60 3
16 6.50 4
17 8.50 1
18 8.30 2
19 5.70 3
20 8.80 4
DATA
# Raw input for the updated question: four leading NA quarters followed by the
# observed sales. The cut-off values are NOT applied here; the first statement
# of the answer adds them to rows 5-8, which yields the 2.2, 4.1, 2.4, 4.1
# shown in the printed result.
mydf2 <- data.frame(
  Sales = c(
    NA, NA, NA, NA,
    1.2, 2.1, 1.4, 1.1,
    2.1, 1.4, 1.1, 1.2,
    2.1, 1.4, 1.1, 1.2,
    2.1, 1.4, 1.1, 2.3
  )
)
For original question
Here is one approach. I considered cases where you would have NA in your data. First, I added the values of cut_off in mydf1. Then, I create a new variable called quarter and defined groups. For each group, I applied cumsum() and summed up the values. If you do not have any NA, the final line would be mutate(sales = cumsum(sales)) in the code below.
library(dplyr)

# 1. Add the cut-off values to the four 2010 quarters (cut_off is recycled
#    along the rows, which are ordered Q1..Q4 within each year).
# 2. Group by the quarter label ("Q1".."Q4").
# 3. Accumulate sales within each quarter, counting NA as 0.
mydf2 %>%
  mutate(
    sales = if_else(
      startsWith(sales_quarter, "2010"),
      sales + mydf1$cut_off,
      sales
    )
  ) %>%
  group_by(quarter = substr(sales_quarter, 5, 6)) %>%
  mutate(sales = cumsum(coalesce(sales, 0)))
sales_quarter sales quarter
<chr> <dbl> <chr>
1 2010Q1 2.20 Q1
2 2010Q2 4.10 Q2
3 2010Q3 2.40 Q3
4 2010Q4 4.10 Q4
5 2011Q1 4.30 Q1
6 2011Q2 5.50 Q2
7 2011Q3 3.50 Q3
8 2011Q4 5.30 Q4
9 2012Q1 6.40 Q1
10 2012Q2 6.90 Q2
11 2012Q3 4.60 Q3
12 2012Q4 6.50 Q4
13 2013Q1 8.50 Q1
14 2013Q2 8.30 Q2
15 2013Q3 5.70 Q3
16 2013Q4 8.80 Q4
DATA
# Cut-off values, one per quarter (Q1..Q4).
mydf1 <- structure(list(cut_off = c(1, 2, 1, 3)), .Names = "cut_off", row.names = c(NA,
4L), class = "data.frame")
# Quarterly sales for 2010Q1-2013Q4, labelled "<year>Q<quarter>".
mydf2 <- structure(list(sales_quarter = c("2010Q1", "2010Q2", "2010Q3",
"2010Q4", "2011Q1", "2011Q2", "2011Q3", "2011Q4", "2012Q1", "2012Q2",
"2012Q3", "2012Q4", "2013Q1", "2013Q2", "2013Q3", "2013Q4"),
sales = c(1.2, 2.1, 1.4, 1.1, 2.1, 1.4, 1.1, 1.2, 2.1, 1.4,
1.1, 1.2, 2.1, 1.4, 1.1, 2.3)), .Names = c("sales_quarter",
"sales"), class = "data.frame", row.names = c(NA, -16L))
New sequential answer:
> df
year_quater sales pred_sales
1 2010Q1 1.2 NA
2 2010Q2 2.1 NA
3 2010Q3 1.4 NA
4 2010Q4 1.1 NA
5 2011Q1 2.1 NA
6 2011Q2 1.4 NA
7 2011Q3 1.1 NA
8 2011Q4 1.2 NA
9 2012Q1 2.1 NA
10 2012Q2 1.4 NA
11 2012Q3 1.1 NA
12 2012Q4 1.2 NA
13 2013Q1 2.1 NA
14 2013Q2 1.4 NA
15 2013Q3 1.1 NA
16 2013Q4 2.3 NA
# Cut-off values that seed the first year, one per quarter (Q1..Q4).
pred <- c(1, 2, 1, 3)

n_obs <- nrow(df)

# Quarter position of every row: 1, 2, 3, 4, 1, 2, ... Rows are assumed to be
# in chronological order starting at a first quarter. rep_len() also copes
# with a final year that is not complete.
qtr <- rep_len(seq_along(pred), n_obs)

# Add the seed to the first year's sales only.
seeded <- df$sales
first_year <- seq_len(min(length(pred), n_obs))
seeded[first_year] <- seeded[first_year] + pred[first_year]

# Each prediction is this quarter's sales plus the previous year's prediction
# for the same quarter, i.e. a running total within each quarter position.
df$pred_sales <- ave(seeded, qtr, FUN = cumsum)
> df
year_quater sales pred_sales
1 2010Q1 1.2 2.2
2 2010Q2 2.1 4.1
3 2010Q3 1.4 2.4
4 2010Q4 1.1 4.1
5 2011Q1 2.1 4.3
6 2011Q2 1.4 5.5
7 2011Q3 1.1 3.5
8 2011Q4 1.2 5.3
9 2012Q1 2.1 6.4
10 2012Q2 1.4 6.9
11 2012Q3 1.1 4.6
12 2012Q4 1.2 6.5
13 2013Q1 2.1 8.5
14 2013Q2 1.4 8.3
15 2013Q3 1.1 5.7
16 2013Q4 2.3 8.8
This answer creates a variable sequence by using the number of rows of your data and loops through every 4 rows, calculates the pred_sales, updates the pred values to use in the next loop iteration.

Calculate formula over all rows and specific columns of dataframe

I have the following sample dataframe with prices of toys in different shops:
dfData <- data.frame(article = c("Fix", "Foxi", "Stan", "Olli", "Barbie", "Ken", "Hulk"),
priceToys1 = c(10, NA, 10.5, NA, 10.7, 11.2, 12.0),
priceAllToys = c(NA, 11.4, NA, 11.9, 11.7, 11.1, NA),
price123Toys = c(12, 12.4, 12.7, NA, NA, 11.0, 12.1))
Additionally I generate a min price column by adding:
dfData$MinPrice <- apply(dfData[, grep("price", colnames(dfData))], 1, FUN=min, na.rm = TRUE)
So I have this dataframe now:
# article priceToys1 priceAllToys price123Toys MinPrice
#1 Fix 10.0 NA 12.0 10.0
#2 Foxi NA 11.4 12.4 11.4
#3 Stan 10.5 NA 12.7 10.5
#4 Olli NA 11.9 NA 11.9
#5 Barbie 10.7 11.7 NA 10.7
#6 Ken 11.2 11.1 11.0 11.0
#7 Hulk 12.0 NA 12.1 12.0
How do I add columns to the dataframe that give each price as a percentage of the minimum price? The new column names should also include the shop name.
The result should look like this:
# article priceToys1 PercToys1 priceAllToys PercAllToys price123Toys Perc123Toys MinPrice
#1 Fix 10.0 100.0 NA NA 12.0 120.0 10.0
#2 Foxi NA NA 11.4 100.0 12.4 108.8 11.4
#3 Stan 10.5 100.0 NA NA 12.7 121.0 10.5
#4 Olli NA NA 11.9 100.0 NA NA 11.9
#5 Barbie 10.7 100.0 11.7 109.4 NA NA 10.7
#6 Ken 11.2 101.8 11.1 100.9 11.0 100.0 11.0
#7 Hulk 12.0 100.0 NA NA 12.1 100.8 12.0
Two possible solutions:
1) With the data.table-package:
# load the 'data.table'-package
library(data.table)

# Price columns to operate on (columns 2 to 4 of dfData)
cols <- names(dfData)[2:4] # or: grep("price", names(dfData), value = TRUE)

# Turn dfData into a data.table
setDT(dfData)

# Add one "Perc<shop>" column per price column: the price as a percentage of
# the row's MinPrice, rounded to one decimal. The trailing [] prints the result.
dfData[
  ,
  paste0("Perc", gsub("price", "", cols)) := lapply(
    .SD,
    function(price) round(100 * price / MinPrice, 1)
  ),
  .SDcols = cols
][]
which gives:
article priceToys1 priceAllToys price123Toys MinPrice PercToys1 PercAllToys Perc123Toys
1: Fix 10.0 NA 12.0 10.0 100.0 NA 120.0
2: Foxi NA 11.4 12.4 11.4 NA 100.0 108.8
3: Stan 10.5 NA 12.7 10.5 100.0 NA 121.0
4: Olli NA 11.9 NA 11.9 NA 100.0 NA
5: Barbie 10.7 11.7 NA 10.7 100.0 109.3 NA
6: Ken 11.2 11.1 11.0 11.0 101.8 100.9 100.0
7: Hulk 12.0 NA 12.1 12.0 100.0 NA 100.8
2) With base R:
# Price columns to operate on (columns 2 to 4 of dfData)
cols <- names(dfData)[2:4] # or: grep("price", names(dfData), value = TRUE)

# Add one "Perc<shop>" column per price column: the price as a percentage of
# the row's MinPrice, rounded to one decimal.
dfData[paste0("Perc", gsub("price", "", cols))] <- lapply(
  dfData[cols],
  function(price) round(100 * price / dfData$MinPrice, 1)
)
which will get you the same result.
We can use mutate_at from dplyr
library(dplyr)

# For every column whose name starts with "price", add a "<column>_Perc"
# column holding that price as a percentage of the row's MinPrice, rounded to
# one decimal. across() replaces the superseded mutate_at() and the deprecated
# funs(), and .names reproduces the column names they generated.
dfData <- dfData %>%
  mutate(across(
    matches("^price"),
    ~ round(100 * .x / MinPrice, 1),
    .names = "{.col}_Perc"
  ))
dfData

Calculate row mean with condition in dplyr

My dataset looks like this:
> head(tempExp)
points.id wc2.0_30s_tavg_01 wc2.0_30s_tavg_02
1 AmsterdamGreenhouses_Calamagrostis eigejos-AM_Nhigh 3.1 3.2
2 AmsterdamGreenhouses_Molinia caerulea-AM_Nhigh 3.1 3.2
3 Bangor_Alnus-ECM/AM_Nlow 3.8 3.6
4 Bangor_Betula_pendula-ECM_Nlow 3.8 3.6
5 Bangor_Fagus-ECM_Nlow 3.8 3.6
6 BioCON_nolegumes_mixed-AM_Nlow -11.8 -7.9
wc2.0_30s_tavg_03 wc2.0_30s_tavg_04 wc2.0_30s_tavg_05 wc2.0_30s_tavg_06 wc2.0_30s_tavg_07
1 5.9 8.3 12.6 15.1 17.1
2 5.9 8.3 12.6 15.1 17.1
3 5.4 7.3 10.3 12.7 14.7
4 5.4 7.3 10.3 12.7 14.7
5 5.4 7.3 10.3 12.7 14.7
6 -1.2 7.2 14.5 19.3 21.8
For each row (id) I need to calculate the mean across the entire row, but only including those columns with value > 5.
library(dplyr)

# simulate a similar data set
set.seed(1984)
n_rows <- 100L

# Random 5-character IDs. vapply() draws them one after another in index
# order, so the random-number stream is consumed exactly as a for loop would.
x <- vapply(
  seq_len(n_rows),
  function(i) paste(sample(c(LETTERS, 0:9), 5, replace = TRUE), collapse = ""),
  character(1)
)

df <- data.frame(
  ID = x,
  v1 = 3 * rnorm(n_rows),
  v2 = 5 + 3 * rnorm(n_rows),
  v3 = sample(1:20, n_rows, replace = TRUE),
  v4 = rpois(n_rows, 6),
  v5 = rep(15, n_rows)
)
head(df)
# ID v1 v2 v3 v4 v5
#1 XPNL0 7.839162 -1.341105 12 5 15
#2 5BQ3H -1.241025 7.651719 1 5 15
#3 5AZZH 2.185374 2.186604 6 4 15
#4 AKX7H 3.148868 2.513623 13 5 15
#5 VAW42 2.757498 3.888333 16 5 15
#6 F4UST -1.894727 4.587320 2 2 15

# Row-wise mean over the value columns (everything except ID), using only the
# entries greater than 5. A row with no entry above 5 would give NaN; that
# cannot happen here because v5 is always 15.
df <- df %>%
  mutate(
    avg = apply(
      as.matrix(df[, -1]),
      1,
      function(vals) mean(vals[vals > 5])
    )
  )
head(df)
# ID v1 v2 v3 v4 v5 avg
#1 XPNL0 7.839162 -1.341105 12 5 15 11.61305
#2 5BQ3H -1.241025 7.651719 1 5 15 11.32586
#3 5AZZH 2.185374 2.186604 6 4 15 10.50000
#4 AKX7H 3.148868 2.513623 13 5 15 14.00000
#5 VAW42 2.757498 3.888333 16 5 15 15.50000
#6 F4UST -1.894727 4.587320 2 2 15 15.00000

Resources