Calculation with apply - r

I have a table with five columns: Year, Revenue, Pensions, Income, and Wages. With this table I do the calculation in the code below:
library(dplyr)
#DATA
TEST <- data.frame(
  Year     = c(2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021),
  Revenue  = c(8634,5798,6022,6002,6266,6478,6732,7224,6956,6968,7098,7620,7642,8203,9856,20328,22364,22222,23250,25250,26250,27250),
  Pensions = c(8734,5798,7011,7002,7177,7478,7731,7114,7957,7978,7098,7710,7742,8203,9857,10328,11374,12211,13150,15150,17150,17150),
  Income   = c(8834,5898,6033,6002,6366,6488,6833,8334,6956,6968,8098,8630,8642,8203,9856,30328,33364,32233,33350,35350,36350,38350),
  Wages    = c(8834,5598,8044,8002,8488,8458,8534,5444,8958,8988,5098,5840,5842,8203,9858,40328,44384,42244,43450,45450,48450,45450)
)
#FUNCTION
fun1 <- function(x){ ((x - lag(x))/lag(x))*100}
#CALCULATION
ESTIMATION_0 <- mutate(TEST,
  Nominal_growth_Revenue  = fun1(Revenue),
  Nominal_growth_Pensions = fun1(Pensions),
  Nominal_growth_Income   = fun1(Income),
  Nominal_growth_Wages    = fun1(Wages)
)
My intention is to optimize this code and do the calculation with an apply function (or something similar). For this calculation I wrote four lines of code, but I would like to do it in one line. Can anybody help me with this?

Assuming you have a character vector with the relevant columns:
cols <- c("Revenue", "Pensions", "Income", "Wages")
Use apply():
TEST[paste0('Nominal_growth_', cols)] <- apply(TEST[cols], 2, fun1)
or data.table:
library(data.table)
setDT(TEST)
TEST[, (paste0('Nominal_growth_', cols)) := lapply(.SD, fun1), .SDcols = cols]

You could do this:
vars_names <- paste0("Nominal_groth", names(select(TEST, -Year)))
TEST %>%
bind_cols( (TEST %>% mutate_at(vars(-Year), ~fun1(.x))) %>% select(-Year) %>% set_names(vars_names) )
Year Revenue Pensions Income Wages Nominal_growth_Revenue Nominal_growth_Pensions Nominal_growth_Income Nominal_growth_Wages
1 2000 8634 8734 8834 8834 NA NA NA NA
2 2001 5798 5798 5898 5598 -32.8468844 -33.6157545 -33.2352275 -36.63119765
3 2002 6022 7011 6033 8044 3.8634012 20.9210072 2.2889115 43.69417649
4 2003 6002 7002 6002 8002 -0.3321156 -0.1283697 -0.5138405 -0.52212829
5 2004 6266 7177 6366 8488 4.3985338 2.4992859 6.0646451 6.07348163
6 2005 6478 7478 6488 8458 3.3833387 4.1939529 1.9164310 -0.35344015
7 2006 6732 7731 6833 8534 3.9209633 3.3832576 5.3175092 0.89855758
8 2007 7224 7114 8334 5444 7.3083779 -7.9808563 21.9669252 -36.20810874
9 2008 6956 7957 6956 8958 -3.7098560 11.8498735 -16.5346772 64.54812638
10 2009 6968 7978 6968 8988 0.1725129 0.2639186 0.1725129 0.33489618
11 2010 7098 7098 8098 5098 1.8656716 -11.0303334 16.2169920 -43.27992879
12 2011 7620 7710 8630 5840 7.3541843 8.6221471 6.5695233 14.55472734
13 2012 7642 7742 8642 5842 0.2887139 0.4150454 0.1390498 0.03424658
14 2013 8203 8203 8203 8203 7.3410102 5.9545337 -5.0798426 40.41424170
15 2014 9856 9857 9856 9858 20.1511642 20.1633549 20.1511642 20.17554553
16 2015 20328 10328 30328 40328 106.2500000 4.7783301 207.7110390 309.08906472
17 2016 22364 11374 33364 44384 10.0157418 10.1278079 10.0105513 10.05752827
18 2017 22222 12211 32233 42244 -0.6349490 7.3588887 -3.3898813 -4.82155732
19 2018 23250 13150 33350 43450 4.6260463 7.6897879 3.4653926 2.85484329
20 2019 25250 15150 35350 45450 8.6021505 15.2091255 5.9970015 4.60299194
21 2020 26250 17150 36350 48450 3.9603960 13.2013201 2.8288543 6.60066007
22 2021 27250 17150 38350 45450 3.8095238 0.0000000 5.5020633 -6.19195046
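If you are on dplyr 1.0 or newer, across() collapses the whole calculation into a single mutate() call. A sketch reusing fun1 and the cols vector from the apply answer above (the .names argument controls the new column names):
# one line per new column is no longer needed; across() applies fun1 to every column in `cols`
ESTIMATION_1 <- TEST %>%
  mutate(across(all_of(cols), fun1, .names = "Nominal_growth_{.col}"))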

Related

How to apply 10-year average filter in R on a dataframe?

How can I run a 10-year average filter on the NBP column of this dataframe?
This is the head of the dataframe:
> head(df3)
Year NBP
1 1850 35.454343
2 1851 4.5634543
3 1852 112.389182
4 1853 151.169251
5 1854 73.123145
6 1855 -72.309647
In reality I have years from 1850 to 2100. How can I apply a 10-year average filter to the NBP variable of this dataframe and plot it over time?
One option is the slider package function slide_dbl(), which lets you create rolling variables. Here is the code:
library(slider)
library(dplyr)
set.seed(123)
# Data
df <- data.frame(Year = 1990:2020, NBP = rnorm(31, 2, 0.5))
# Rolling 10-year mean (current year plus the nine before it)
df %>%
  mutate(rollingNBP = slide_dbl(NBP, mean, .before = 9, .complete = TRUE))
Output:
Year NBP rollingNBP
1 1990 1.8399718 NA
2 1991 1.3442388 NA
3 1992 1.7001958 NA
4 1993 1.9352947 NA
5 1994 2.4433681 NA
6 1995 1.9243020 NA
7 1996 2.1648956 NA
8 1997 0.3863386 NA
9 1998 1.6141041 NA
10 1999 2.1432743 1.749598
11 2000 1.3897440 1.704576
12 2001 2.2172752 1.791879
13 2002 2.4000884 1.861868
14 2003 1.9180345 1.860142
15 2004 2.6214594 1.877952
16 2005 1.5328075 1.838802
17 2006 2.1968543 1.841998
18 2007 2.2018157 2.023546
19 2008 1.5567816 2.017813
20 2009 1.3405312 1.937539
21 2010 2.0144220 2.000007
22 2011 1.7839351 1.956673
23 2012 2.8449363 2.001158
24 2013 2.6141964 2.070774
25 2014 2.1380117 2.022429
26 2015 1.4755122 2.016700
27 2016 1.7395653 1.970971
28 2017 2.8116013 2.031949
29 2018 1.4649659 2.022768
30 2019 2.8429436 2.173009
31 2020 1.8791551 2.159482
If you want to include a plot, you can use ggplot2:
library(ggplot2)
#Code2
df %>%
  mutate(rollingNBP = slide_dbl(NBP, mean, .before = 9, .complete = TRUE)) %>%
  ggplot(aes(x = Year, y = rollingNBP)) +
  geom_line()
Output: a line plot of rollingNBP against Year (the first nine years are NA and are dropped from the line).
And if you want to see both series, try this:
library(tidyr)
#Code 3
df %>%
  mutate(rollingNBP = slide_dbl(NBP, mean, .before = 9, .complete = FALSE)) %>%
  pivot_longer(-Year) %>%
  ggplot(aes(x = Year, y = value, group = name, color = name)) +
  geom_line()
Output: a plot with two lines, the raw NBP series and the rolling mean rollingNBP, coloured by series name.
An option with rollmean from zoo
library(dplyr)
library(zoo)
df %>%
  mutate(rollingNBP = rollmeanr(NBP, k = 10, fill = NA))
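For completeness, the same trailing mean is available in base R via stats::filter(): with equal weights of 1/10 and sides = 1, only the current and the nine preceding values enter, so the first nine results are NA. A sketch on the df built above:
# trailing 10-year moving average in base R; returns a ts object, hence as.numeric()
df$rollingNBP_base <- as.numeric(stats::filter(df$NBP, rep(1/10, 10), sides = 1))
head(df$rollingNBP_base, 12)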

How can I change row and column indexes of a dataframe in R?

I have a dataframe in R which has three columns: Product_Name (name of the book), Year, and Units (number of units sold in that year). It looks like this:
Product_Name Year Units
A Modest Proposal 2011 10000
A Modest Proposal 2012 11000
A Modest Proposal 2013 12000
A Modest Proposal 2014 13000
Animal Farm 2011 8000
Animal Farm 2012 9000
Animal Farm 2013 11000
Animal Farm 2014 15000
Catch 22 2011 1000
Catch 22 2012 2000
Catch 22 2013 3000
Catch 22 2014 4000
....
I intend to build an R Shiny dashboard with this data, keeping the year as a drop-down menu option, for which I want the dataframe in the following format:
A Modest Proposal Animal Farm Catch 22
2011 10000 8000 1000
2012 11000 9000 2000
2013 12000 11000 3000
2014 13000 15000 4000
or the other way round, where the product names are row indexes and the years are column indexes; either way works.
How can I do this in R?
Your general issue is transforming long data to wide data. For this, you can use data.table's dcast function (amongst many others):
library(data.table)

dt = data.table(
  Name = c(rep('A', 4), rep('B', 4), rep('C', 4)),
  Year = c(rep(2011:2014, 3)),
  Units = rnorm(12)
)
> dt
Name Year Units
1: A 2011 -0.26861318
2: A 2012 0.27194732
3: A 2013 -0.39331361
4: A 2014 0.58200101
5: B 2011 0.09885381
6: B 2012 -0.13786098
7: B 2013 0.03778400
8: B 2014 0.02576433
9: C 2011 -0.86682584
10: C 2012 -1.34319590
11: C 2013 0.10012673
12: C 2014 -0.42956207
> dcast(dt, Year ~ Name, value.var = 'Units')
Year A B C
1: 2011 -0.2686132 0.09885381 -0.8668258
2: 2012 0.2719473 -0.13786098 -1.3431959
3: 2013 -0.3933136 0.03778400 0.1001267
4: 2014 0.5820010 0.02576433 -0.4295621
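Applied to the column names from the question (assuming the long data is in a data frame called df), the call would look like this; as.data.table() makes sure data.table's dcast method is used:
# one row per Year, one column per Product_Name
dcast(as.data.table(df), Year ~ Product_Name, value.var = "Units")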
For the next time, it is easier if you provide a reproducible example, so that the people assisting you do not have to manually recreate your data structure :)
You can use pivot_wider from the tidyr package. I assume your data is saved in df; you also need the dplyr package for %>% (piping).
library(tidyr)
library(dplyr)
df %>%
  pivot_wider(names_from = Product_Name, values_from = Units)
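If you instead want the transposed layout (product names as row indexes, years as columns), swap names_from and, if you really want row names, move Product_Name into them with tibble::column_to_rownames(). A sketch on the same df:
library(tibble)

# products as rows, one column per year, Product_Name moved into the row names
df %>%
  pivot_wider(names_from = Year, values_from = Units) %>%
  column_to_rownames("Product_Name")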
Assuming that your dataframe is ordered by Product_Name and by Year, I will generate artificial data similar to your dataframe; try this:
Col_1 <- sort(rep(LETTERS[1:3], 4))
Col_2 <- rep(2011:2014, 3)
# artificial data
resp <- ceiling(rnorm(12, 5000, 500))
uu <- data.frame(Col_1, Col_2, resp)
uu
# output is
Col_1 Col_2 resp
1 A 2011 5297
2 A 2012 4963
3 A 2013 4369
4 A 2014 4278
5 B 2011 4721
6 B 2012 5021
7 B 2013 4118
8 B 2014 5262
9 C 2011 4601
10 C 2012 5013
11 C 2013 5707
12 C 2014 5637
>
> # Here starts
> output <- aggregate(uu$resp, list(uu$Col_1), function(x) {x})
> output
Group.1 x.1 x.2 x.3 x.4
1 A 5297 4963 4369 4278
2 B 4721 5021 4118 5262
3 C 4601 5013 5707 5637
>
output2 <- output [, -1]
colnames(output2) <- levels(as.factor(uu$Col_2))
rownames(output2) <- levels(as.factor(uu$Col_1))
# transpose the matrix
> t(output2)
A B C
2011 5297 4721 4601
2012 4963 5021 5013
2013 4369 4118 5707
2014 4278 5262 5637
> # or convert to data.frame
> as.data.frame(t(output2))
A B C
2011 5297 4721 4601
2012 4963 5021 5013
2013 4369 4118 5707
2014 4278 5262 5637
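Going back to the original column names, base R's xtabs() builds the same wide table in one line. A sketch assuming the long data frame is called df and each Product_Name/Year pair occurs once, so the implicit sum just returns that single value:
# one row per Year, one column per Product_Name; entries are the (summed) Units
wide <- xtabs(Units ~ Year + Product_Name, data = df)
wide

# as a plain data.frame, with the years as row names
as.data.frame.matrix(wide)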

Looping a Function over Groups and Years

I have a sample of many countries across several years that contains information on output (GDP). I would like to calculate the "Output Gap" using a function I found at R-Bloggers here, but would like it to loop over all the countries in my sample taking the years into consideration, with results being stored in a matrix (binding across rows).
The function looks as follows:
hp <- function(data, l = 1600){
  # H-P filter code from Farnsworth
  hpfilterq <- function(x = data, lambda = l){
    eye <- diag(length(x))
    result <- solve(eye + lambda * crossprod(diff(eye, lag = 1, differences = 2)), x)
    return(result)
  }
  hpfiltered <- hpfilterq(data)
  hpgap <- data - hpfiltered
  # linear, quadratic and cubic time trends
  t1 <- 1:length(data)
  t2 <- t1^2
  t3 <- t1^3
  t1 <- ts(t1)
  t2 <- ts(t2)
  t3 <- ts(t3)
  # combine the series with the trend terms
  datats <- ts(data)
  myseries <- ts.union(datats, t1, t2, t3)
  # polynomial detrending of order 1 to 3
  polynomial1 <- lm(datats ~ t1, data = myseries)
  polynomial2 <- lm(datats ~ t1 + t2, data = myseries)
  polynomial3 <- lm(datats ~ t1 + t2 + t3, data = myseries)
  # return the H-P gap and the residuals from the three polynomial trends
  returndata <- data.frame(hpgap, polynomial1$residuals, polynomial2$residuals, polynomial3$residuals)
  colnames(returndata) <- c("H-P Gap", "Poly1", "Poly2", "Poly3")
  return(returndata)
}
My sample hypothetically looks like:
country year output
1 AUS 2000 49709.21
2 AUS 2001 59805.90
3 AUS 2002 46501.57
4 AUS 2003 53521.78
5 AUS 2004 53824.41
6 AUS 2005 55001.43
7 AUS 2006 48356.12
8 AUS 2007 55125.00
9 AUS 2008 58551.84
10 AUS 2009 57805.95
11 AUS 2010 64858.86
12 AUS 2011 67395.81
13 AUS 2012 69043.00
14 AUS 2013 73789.00
15 AUS 2014 77869.09
16 BEL 2000 7110.00
17 BEL 2001 7235.10
18 BEL 2002 7204.10
19 BEL 2003 7327.60
20 BEL 2004 7558.70
21 BEL 2005 7123.10
22 BEL 2006 7539.00
23 BEL 2007 7943.40
24 BEL 2008 8052.50
25 BEL 2009 7509.60
26 BEL 2010 8455.50
27 BEL 2011 8749.40
28 BEL 2012 9694.10
29 BEL 2013 9614.40
30 BEL 2014 8707.50
I would like to apply function "hp" to "AUS",
H-P Gap Poly1 Poly2 Poly3
1 2393.2324 2751.8407 -3684.68922 -3536.276248
2 10838.8666 11069.4941 7391.47697 7412.678826
3 -4118.0018 -4013.8596 -5357.75042 -5414.832333
4 1239.8878 1227.3108 1793.15960 1698.566717
5 -135.2596 -249.0938 1802.10804 1702.622415
6 -657.9475 -851.1054 2261.06288 2181.148204
7 -9031.2784 -9275.4400 -5526.69186 -5570.726477
8 -4024.7279 -4285.5977 -324.65619 -324.656186
9 -2394.8369 -2637.7883 1110.95990 1154.994518
10 -4970.1209 -5162.7119 -2050.54360 -1970.628924
11 224.6324 111.1625 2162.36431 2261.849928
12 881.4586 869.0828 1434.93163 1529.524516
13 633.1949 737.2392 -606.65163 -549.569720
14 3474.5127 3704.2066 26.18952 4.987662
15 5646.3875 6005.2600 -431.26992 -579.682899
"BEL",
H-P Gap Poly1 Poly2 Poly3
1 291.55895 311.04333 -4.1179412 -188.99755
2 253.89032 266.24190 86.1497479 59.73838
3 59.93946 65.34048 -0.4624273 70.64511
4 19.96547 18.93905 46.6455333 164.48089
5 86.68999 80.13762 180.5736296 304.50392
6 -514.57784 -525.36381 -372.9781383 -273.42758
7 -266.08305 -279.36524 -95.8097705 -40.95538
8 -30.94904 -44.86667 149.0787330 149.07873
9 -92.93293 -105.66810 77.8873723 23.03298
10 -808.67250 -818.46952 -666.0838526 -765.63441
11 -37.24743 -42.47095 57.9650582 -65.96523
12 81.16801 81.52762 109.2341047 -8.60125
13 850.02282 856.32619 790.5232870 719.41575
14 594.71530 606.72476 426.6326050 453.04398
15 -487.48754 -470.07667 -785.2379412 -600.35833
and all the other countries, and store them in a matrix (or a list of some sort):
H-P Gap Poly1 Poly2 Poly3
1 2393.23236 2751.84068 -3684.6892235 -3536.276248
2 10838.86665 11069.49406 7391.4769723 7412.678826
3 -4118.00184 -4013.85956 -5357.7504192 -5414.832333
4 1239.88784 1227.31082 1793.1596021 1698.566717
5 -135.25961 -249.09380 1802.1080361 1702.622415
6 -657.94746 -851.10542 2261.0628828 2181.148204
7 -9031.27839 -9275.44005 -5526.6918577 -5570.726477
8 -4024.72789 -4285.59767 -324.6561855 -324.656186
9 -2394.83690 -2637.78829 1110.9598994 1154.994518
10 -4970.12091 -5162.71191 -2050.5436029 -1970.628924
11 224.63238 111.16247 2162.3643075 2261.849928
12 881.45859 869.08285 1434.9316306 1529.524516
13 633.19493 737.23923 -606.6516335 -549.569720
14 3474.51272 3704.20660 26.1895151 4.987662
15 5646.38752 6005.25998 -431.2699235 -579.682899
16 291.55895 311.04333 -4.1179412 -188.997549
17 253.89032 266.24190 86.1497479 59.738375
18 59.93946 65.34048 -0.4624273 70.645114
19 19.96547 18.93905 46.6455333 164.480888
20 86.68999 80.13762 180.5736296 304.503916
21 -514.57784 -525.36381 -372.9781383 -273.427580
22 -266.08305 -279.36524 -95.8097705 -40.955381
23 -30.94904 -44.86667 149.0787330 149.078733
24 -92.93293 -105.66810 77.8873723 23.032983
25 -808.67250 -818.46952 -666.0838526 -765.634411
26 -37.24743 -42.47095 57.9650582 -65.965228
27 81.16801 81.52762 109.2341047 -8.601250
28 850.02282 856.32619 790.5232870 719.415746
29 594.71530 606.72476 426.6326050 453.043978
30 -487.48754 -470.07667 -785.2379412 -600.358333
without having to use the very inefficient: hp(data$output[c(1:15)]) and hp(data$output[c(16:30)])
You can combine split and lapply inside a small wrapper function: split() breaks the output column into one vector per country, and lapply() runs hp on each, returning the results as a named list of data.frames.
hp_per_country <- function(x) {
  data_list <- split(x$output, x$country)   # one output vector per country
  result <- lapply(data_list, FUN = hp)     # apply hp() to each country
  result                                    # returned as a named list of data.frames
}
overall_hp_results <- hp_per_country(input_data)
This assumes that input_data is sorted by year within each country, if that matters for the calculation.
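If you want a single stacked object rather than a list, you can row-bind the per-country results afterwards; a short sketch (bind_rows() needs dplyr and keeps the country as a column via .id):
# base R: one data.frame with all countries stacked; rows are named "AUS.1", "AUS.2", ..., "BEL.1", ...
combined <- do.call(rbind, overall_hp_results)

# dplyr alternative: keep the country as an explicit column instead of in the row names
library(dplyr)
combined_df <- bind_rows(overall_hp_results, .id = "country")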

Sum over a column and remove duplicates simultaneously [duplicate]

I have a sample dataframe "data" as follows:
X Y Month Year income
2281205 228120 3 2011 1000
2281212 228121 9 2010 1100
2281213 228121 12 2010 900
2281214 228121 3 2011 9000
2281222 228122 6 2010 1111
2281223 228122 9 2010 3000
2281224 228122 12 2010 1889
2281225 228122 3 2011 778
2281243 228124 12 2010 1111
2281244 228124 3 2011 200
2281282 228128 9 2010 7889
2281283 228128 12 2010 2900
2281284 228128 3 2011 3400
2281302 228130 9 2010 1200
2281303 228130 12 2010 2000
2281304 228130 3 2011 1900
2281352 228135 9 2010 2300
2281353 228135 12 2010 1333
2281354 228135 3 2011 2340
I want to use ddply to compute the total income for each Y (not X), but only if there are four observations for that Y (for example, Y = 228122 has months 6, 9, and 12 of 2010 and month 3 of 2011). If there are fewer than four observations (for example, Y = 228130), I want to simply ignore it. I use the following commands in R for this purpose:
require(plyr)
# the data are in the data csv file
data<-read.csv("data.csv")
# convert Y (integers) into factors
data$Y <- as.factor(data$Y)
# get the count of each unique Y
count<-ddply(data,.(Y), summarize, freq=length(Y))
# get the sum of each unique Y
sum<-ddply(data,.(Y),summarize,tot=sum(income))
# keep the sum only if there are at least 4 observations for that Y
colbind<-cbind(count,sum)
finalsum<-subset(colbind,freq>3)
My output are as follows:
>colbind
Y freq Y tot
1 228120 1 228120 1000
2 228121 3 228121 11000
3 228122 4 228122 6778
4 228124 2 228124 1311
5 228128 3 228128 14189
6 228130 3 228130 5100
7 228135 3 228135 5973
>finalsum
Y freq Y.1 tot
3 228122 4 228122 6778
The above code works, but requires many steps. So, I would like to know whether there is a simpler way of performing this task (using the plyr package).
As pointed out in a comment, you can do multiple operations inside the summarize.
This reduces your code to one line of ddply() and one line of subsetting, which is easy enough with the [ operator:
x <- ddply(data, .(Y), summarize, freq=length(Y), tot=sum(income))
x[x$freq > 3, ]
Y freq tot
3 228122 4 6778
This is also exceptionally easy with the data.table package:
library(data.table)
data.table(data)[, list(freq=length(income), tot=sum(income)), by=Y][freq > 3]
Y freq tot
1: 228122 4 6778
In fact, counting the rows in each group has its own shortcut in data.table, .N:
data.table(data)[, list(freq=.N, tot=sum(income)), by=Y][freq > 3]
Y freq tot
1: 228122 4 6778
I think the dplyr package is faster and more elegant than plyr::ddply.
testData <- read.table(file = "clipboard", header = TRUE)
require(dplyr)

testData %>%
  group_by(Y) %>%
  summarise(total = sum(income), freq = n()) %>%
  filter(freq > 3)
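For completeness, a base R sketch of the same count, sum, and filter without plyr, dplyr, or data.table (assuming the data frame is called data, as above):
freq <- tapply(data$income, data$Y, length)   # observations per Y
tot  <- tapply(data$income, data$Y, sum)      # total income per Y

res <- data.frame(Y = names(tot), freq = as.vector(freq), tot = as.vector(tot))
res[res$freq > 3, ]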
