Accounting with apply not working - r

I am trying to use accounting from the formattable package within apply, and it does not seem to be working:
library(formattable)
set.seed(4226)
temp = data.frame(a = sample(1000:50000, 10), b = sample(1000:50000, 10),
c = sample(1000:50000, 10), d = sample(1000:50000, 10))
temp
a b c d
1 45186 17792 43363 17080
2 26982 25410 2327 17982
3 45204 39757 29883 4283
4 27069 21334 10497 28776
5 47895 46241 22743 36257
6 30161 45254 21382 42275
7 18278 28936 27036 23620
8 31199 30182 10235 7355
9 10664 40312 28324 20864
10 45225 45545 44394 13364
apply(temp, 2, function(x){x = accounting(x, digits = 0)})
a b c d
[1,] 45186 17792 43363 17080
[2,] 26982 25410 2327 17982
[3,] 45204 39757 29883 4283
[4,] 27069 21334 10497 28776
[5,] 47895 46241 22743 36257
[6,] 30161 45254 21382 42275
[7,] 18278 28936 27036 23620
[8,] 31199 30182 10235 7355
[9,] 10664 40312 28324 20864
[10,] 45225 45545 44394 13364
What I want is -
a b c d
[1,] 45,186 17,792 43,363 17,080
[2,] 26,982 25,410 2,327 17,982
[3,] 45,204 39,757 29,883 4,283
[4,] 27,069 21,334 10,497 28,776
[5,] 47,895 46,241 22,743 36,257
[6,] 30,161 45,254 21,382 42,275
[7,] 18,278 28,936 27,036 23,620
[8,] 31,199 30,182 10,235 7,355
[9,] 10,664 40,312 28,324 20,864
[10,] 45,225 45,545 44,394 13,364

You probably want to keep things as a data frame, in which case apply is not the right tool. It will always give you a matrix back.
You might want one of the following options:
cols <- names(temp)  # or whichever subset of columns you want formatted
temp[cols] <- lapply(temp[cols], function(x){accounting(x, digits = 0)})
or
as.data.frame(lapply(temp[cols], function(x){accounting(x, digits = 0)}))
or using dplyr something like:
temp %>%
mutate_at(.vars = cols, .funs = accounting, digits = 0)

Related

Calculate average of lowest values of matrix rows

I have a large matrix, e.g.
> mat = matrix(runif(100), ncol = 5)
> mat
[,1] [,2] [,3] [,4] [,5]
[1,] 0.264442954 0.6408534 0.76472904 0.2437074 0.08019882
[2,] 0.575443586 0.6428957 0.44188123 0.0230842 0.07502289
[3,] 0.894885901 0.5926238 0.55431966 0.7717503 0.52806173
[4,] 0.231978411 0.1192595 0.08170498 0.4264405 0.97486053
[5,] 0.344765840 0.5349323 0.85523617 0.2257759 0.20549035
[6,] 0.499130844 0.9882825 0.99417390 0.8070708 0.29963075
[7,] 0.613479990 0.8877605 0.34282782 0.9525512 0.91488004
[8,] 0.967166001 0.6115709 0.68169111 0.3067973 0.30094691
[9,] 0.957612804 0.5565989 0.88180650 0.3359184 0.17980137
[10,] 0.342177768 0.7735620 0.48154937 0.3692096 0.31299886
[11,] 0.871928110 0.3397143 0.57596030 0.4749349 0.47800019
[12,] 0.387563040 0.1656725 0.47796646 0.8956274 0.68345302
[13,] 0.628535870 0.3418692 0.86513964 0.8052477 0.01850535
[14,] 0.379472842 0.9176644 0.08829197 0.8548662 0.42151935
[15,] 0.071958980 0.6644800 0.90061596 0.4484674 0.32649345
[16,] 0.229463192 0.9995178 0.63995121 0.8369698 0.35091430
[17,] 0.291761976 0.5014815 0.35260028 0.6188047 0.68192891
[18,] 0.077610797 0.2747788 0.07084273 0.5977530 0.37134566
[19,] 0.675912490 0.6059304 0.29321852 0.5638336 0.73866322
[20,] 0.006010715 0.7697045 0.43627939 0.1723969 0.88665973
I want to extract the lowest and highest 2 values of each row and calculate their average.
Eventually, I'd like to generate a new matrix where the first column in the average of the lowest values, and the second column is the average of the highest values.
Thanks in advance!
I believe this does what you want:
do.call(rbind, apply(mat,1, function(x) {sorted = sort(x);
return(data.frame(min=mean(head(sorted,2)), max=mean(tail(sorted,2))))}))
Output:
min max
1 0.14333229 0.8877635
2 0.12311651 0.5283049
3 0.09367614 0.5433373
4 0.39926848 0.6361645
5 0.05196898 0.5473783
6 0.12876148 0.6153546
7 0.29893684 0.8436462
8 0.14254481 0.7023039
9 0.20889814 0.8863141
10 0.44838327 0.8641790
11 0.14859312 0.5533045
12 0.19728414 0.8619284
13 0.37049481 0.7448965
14 0.30070570 0.9320575
15 0.30333510 0.6774024
16 0.21908982 0.7077274
17 0.61804571 0.9239816
18 0.36525615 0.8531795
19 0.22751108 0.4993744
20 0.14251095 0.6353147
Hope this helps!

R Differencing a matrix of stock prices: Delt and apply

I have a data frame where each column is the price history of a stock. I want to create from it a matrix of percentage change in those prices so I am trying to use the Delt function from quantmod. When I try to apply the Delt function using the code below, the result has the incorrect changes in it. When I apply Delt to each column separately, the right changes appear. What am I doing wrong?
here is the data
> library(quantmod)
> m<-read.csv(file="C:/Users/Desktop/prices.csv", header=TRUE, sep=",", row.names=1)
> head(apply(m, 2, Delt, k=1))
A B C D E
[1,] NA NA NA NA NA
[2,] 0.013883218 0.02402696 -0.0028845060 0.10537081 0.065129843
[3,] -0.261075312 -0.27656116 0.0064643504 -0.03748629 0.003693638
[4,] 0.100354272 0.28212656 -0.0006447236 0.01986472 0.022154153
[5,] -0.033372547 0.11482705 -0.0023028136 -0.06562413 0.093813308
[6,] -0.004419677 -0.18058097 0.0007338350 -0.03160490 -0.010180472
> head(Delt(m[,1]))
Delt.1.arithmetic
[1,] NA
[2,] -0.009326258
[3,] 0.015094952
[4,] 0.015403475
[5,] -0.017321925
[6,] 0.019550238
here is the first 30 lines of m:
A B C D E
6/1/2006 186.57 15.16001 12.88992 128.04 63.23111
7/1/2006 184.83 13.93923 12.87614 127.91 63.17918
8/1/2006 187.62 14.40092 12.86571 128.26 63.33122
9/1/2006 190.51 13.87444 12.85661 128.34 63.60919
10/1/2006 187.21 12.84720 12.83384 126.74 62.97229
11/1/2006 190.87 13.52174 12.85892 127.57 64.05329
12/1/2006 198.13 13.81215 12.86256 133.37 64.81722
1/1/2007 195.92 14.24603 12.85711 132.01 65.18905
2/1/2007 196.68 13.95810 12.81181 130.22 65.17205
3/1/2007 195.84 13.74382 12.79951 131.86 65.47931
4/1/2007 196.78 13.76690 12.79787 133.54 65.93696
5/1/2007 199.87 14.22677 12.78429 136.04 65.77217
6/1/2007 198.24 14.10437 12.80410 134.49 65.31252
7/1/2007 200.87 14.19849 12.79181 135.41 65.34666
8/1/2007 203.25 14.08153 12.77221 136.67 65.90654
9/1/2007 201.71 13.99091 12.82413 136.30 65.55228
10/1/2007 204.35 14.62587 12.87034 142.31 67.68648
11/1/2007 208.01 15.15703 12.89241 144.25 68.97027
12/1/2007 205.63 14.69767 12.84175 146.33 69.04170
1/1/2008 198.64 14.58832 12.82002 145.92 69.43480
2/1/2008 196.51 13.58696 12.82676 148.02 70.73637
3/1/2008 198.91 12.77139 12.85116 151.79 71.66404
4/1/2008 197.59 12.52536 12.83977 156.14 72.56894
5/1/2008 197.48 13.14181 12.82874 154.74 73.44301
6/1/2008 198.23 13.14060 12.81345 155.54 73.41066
7/1/2008 199.49 12.67957 12.82051 157.93 73.46999
8/1/2008 197.50 13.85022 12.81246 155.64 72.86505
9/1/2008 180.14 12.89408 12.81132 146.17 70.21979
10/1/2008 176.97 12.13371 12.87266 140.09 69.55070
11/1/2008 160.76 10.22495 12.90323 127.26 67.40361
It may be better to use lapply
lst <- lapply(m, Delt, k=1)
head(data.frame(lst))
#Delt.1.arithmetic Delt.1.arithmetic.1 Delt.1.arithmetic.2 Delt.1.arithmetic.3 Delt.1.arithmetic.4
#1 NA NA NA NA NA
#2 -0.009326258 -0.08052690 -0.0010687188 -0.001015308 -0.0008213299
#3 0.015094952 0.03312212 -0.0008105454 0.002736299 0.0024065873
#4 0.015403475 -0.03655914 -0.0007071123 0.000623733 0.0043890342
#5 -0.017321925 -0.07403839 -0.0017710656 -0.012466885 -0.0100125948
#6 0.019550238 0.05250490 0.0019545514 0.006548840 0.0171662816
Or if we are using apply, convert to vector with as.vector
head(apply(m, 2, function(x) Delt(as.vector(x), k = 1)))
# A B C D E
#[1,] NA NA NA NA NA
#[2,] -0.009326258 -0.08052690 -0.0010687188 -0.001015308 -0.0008213299
#[3,] 0.015094952 0.03312212 -0.0008105454 0.002736299 0.0024065873
#[4,] 0.015403475 -0.03655914 -0.0007071123 0.000623733 0.0043890342
#[5,] -0.017321925 -0.07403839 -0.0017710656 -0.012466885 -0.0100125948
#[6,] 0.019550238 0.05250490 0.0019545514 0.006548840 0.0171662816
unnaming the whole dataset won't give the correct result
head(apply(unname(m), 2, quantmod::Delt))
# [,1] [,2] [,3] [,4] [,5]
#[1,] NA NA NA NA NA
#[2,] 0.013883218 0.02402696 -0.0028845060 0.10537081 0.065129843
#[3,] -0.261075312 -0.27656116 0.0064643504 -0.03748629 0.003693638
#[4,] 0.100354272 0.28212656 -0.0006447236 0.01986472 0.022154153
#[5,] -0.033372547 0.11482705 -0.0023028136 -0.06562413 0.093813308
#[6,] -0.004419677 -0.18058097 0.0007338350 -0.03160490 -0.010180472
However, if we unname the vector, it gives the correct output
head(apply(m, 2, function(x) Delt(unname(x), k=1)))
# A B C D E
#[1,] NA NA NA NA NA
#[2,] -0.009326258 -0.08052690 -0.0010687188 -0.001015308 -0.0008213299
#[3,] 0.015094952 0.03312212 -0.0008105454 0.002736299 0.0024065873
#[4,] 0.015403475 -0.03655914 -0.0007071123 0.000623733 0.0043890342
#[5,] -0.017321925 -0.07403839 -0.0017710656 -0.012466885 -0.0100125948
#[6,] 0.019550238 0.05250490 0.0019545514 0.006548840 0.0171662816
In essence, Delt needs a vector without any rowname attributes. The as.vector and unname calls within the apply remove those attributes, while lapply removes them by default.
Just to confirm it
head(apply(`row.names<-`(m, NULL), 2, Delt, k=1))
# A B C D E
#[1,] NA NA NA NA NA
#[2,] -0.009326258 -0.08052690 -0.0010687188 -0.001015308 -0.0008213299
#[3,] 0.015094952 0.03312212 -0.0008105454 0.002736299 0.0024065873
#[4,] 0.015403475 -0.03655914 -0.0007071123 0.000623733 0.0043890342
#[5,] -0.017321925 -0.07403839 -0.0017710656 -0.012466885 -0.0100125948
#[6,] 0.019550238 0.05250490 0.0019545514 0.006548840 0.0171662816

Modified rollapply mean

I have a data file which consists of daily xy locations and a logical vector denoting whether or not the location is an outlier. Here is some (poorly created, I know) sample data:
x=seq(3,10,length.out=30)
y=seq(42,45,length.out=30)
outlier=c(F,F,F,F,F,F,F,F,T,T,T,F,F,F,F,F,F,F,F,F,F,T,F,T,F,F,F,F,F,F)
data=cbind(x,y,outlier)
> data
x y outlier
[1,] 3.000000000 42.00000000 0
[2,] 3.241379310 42.10344828 0
[3,] 3.482758621 42.20689655 0
[4,] 3.724137931 42.31034483 0
[5,] 3.965517241 42.41379310 0
[6,] 4.206896552 42.51724138 0
[7,] 4.448275862 42.62068966 0
[8,] 4.689655172 42.72413793 0
[9,] 4.931034483 42.82758621 1
[10,] 5.172413793 42.93103448 1
[11,] 5.413793103 43.03448276 1
[12,] 5.655172414 43.13793103 0
[13,] 5.896551724 43.24137931 0
[14,] 6.137931034 43.34482759 0
[15,] 6.379310345 43.44827586 0
[16,] 6.620689655 43.55172414 0
[17,] 6.862068966 43.65517241 0
[18,] 7.103448276 43.75862069 0
[19,] 7.344827586 43.86206897 0
[20,] 7.586206897 43.96551724 0
[21,] 7.827586207 44.06896552 0
[22,] 8.068965517 44.17241379 1
[23,] 8.310344828 44.27586207 0
[24,] 8.551724138 44.37931034 1
[25,] 8.793103448 44.48275862 0
[26,] 9.034482759 44.58620690 0
[27,] 9.275862069 44.68965517 0
[28,] 9.517241379 44.79310345 0
[29,] 9.758620690 44.89655172 0
[30,] 10.000000000 45.00000000 0
What I need is to take a non-overlapping 6-day mean of the x and y columns. This is easy enough with rollapply(). However, I do not want outlier=1 values to be included in the 6-day mean; nor do I want the 6-day window to 'span' the gap left behind by removing all rows where outlier=T. Instead, I want to make an exception to the 'non-overlapping rule'.
I think this is best explained using the sample data above: the first value should be the mean of rows 1:6, but rather than the second value being the mean of rows 7:12 (including outlier=1 values) or of rows c(7:8,12:15) (skipping over outlier=1 values) I want it to overlap with the first window and take the mean of rows 3:8.
So for the length 30 sample data above, the end result should be of length 5, showing the mean values of rows 1:6, 3:8, 12:17, 16:21 & 25:30 (ideally all values which result from overlapping windows should be labelled as such; i.e. values 1:4 overlap, whereas the final value is unique)
Here is a function that will give you the indices of the endpoints of the averages that you want:
# Return the end indices of the averaging windows for a series with outliers.
#
# For each run of at least `window` consecutive non-outlier observations,
# windows of length `window` are laid down back-to-back from the start of the
# run. If the run length is not an exact multiple of `window`, one extra
# (overlapping) window ending at the run's last element is added, so every run
# is covered all the way to its end.
#
# Args:
#   outlier: logical (or 0/1 numeric) vector flagging outlier observations.
#   window:  window length (default 6).
# Returns:
#   Sorted vector of unique window end indices.
findIndices <- function(outlier, window = 6) {
  r <- rle(outlier)
  # End and start index of every run of equal values.
  rends <- cumsum(r$lengths)
  segs <- cbind(rends - r$lengths + 1, rends)
  # Keep only non-outlier runs long enough to hold at least one window.
  # drop = FALSE keeps `segs` a matrix even when a single run qualifies;
  # without it the subset collapses to a vector and apply() below errors.
  segs <- segs[with(r, lengths >= window & values == 0), , drop = FALSE]
  # Back-to-back window end points within each qualifying run.
  indices <- unlist(apply(segs, 1, function(x) seq(x[1] + window - 1, x[2], by = window)))
  # Always include each run's final index (the possibly-overlapping window).
  sort(unique(c(indices, segs[, 2])))
}
findIndices(data[,3])
## [1] 6 8 17 21 30
You can then get the averages you want like this:
id<-findIndices(data[,3])
require(zoo)
cbind(index=id,rollmean(data[,1:2],6)[id-5,])
## index x y
## [1,] 6 3.603448 42.25862
## [2,] 8 4.086207 42.46552
## [3,] 17 6.258621 43.39655
## [4,] 21 7.224138 43.81034
## [5,] 30 9.396552 44.74138
You can put it all together in a single function like this:
# Moving averages of `x` over windows of length `window`, excluding windows
# that contain outliers, using the end-point scheme from findIndices().
#
# Args:
#   x:       numeric matrix (or vector) of observations, one row per time step.
#   outlier: logical (or 0/1) vector flagging outlier rows.
#   window:  window length.
# Returns:
#   Matrix whose first column is the window end index and whose remaining
#   columns are the rolling means of `x` at those windows.
maWithOutliers <- function(x, outlier, window) {
  end_points <- findIndices(outlier, window)
  # rollmean() row k is the mean of rows k .. k+window-1, so the window
  # ending at `end_points` starts at `end_points - window + 1`.
  start_rows <- end_points - window + 1
  window_means <- rollmean(x, window)
  cbind(index = end_points, window_means[start_rows, ])
}
> maWithOutliers(data[,1:2],data[,3],6)
index x y
[1,] 6 3.603448 42.25862
[2,] 8 4.086207 42.46552
[3,] 17 6.258621 43.39655
[4,] 21 7.224138 43.81034
[5,] 30 9.396552 44.74138
> maWithOutliers(data[,1:2],data[,3],4)
index x y
[1,] 4 3.362069 42.15517
[2,] 8 4.327586 42.56897
[3,] 15 6.017241 43.29310
[4,] 19 6.982759 43.70690
[5,] 21 7.465517 43.91379
[6,] 28 9.155172 44.63793
[7,] 30 9.637931 44.84483
>

R perform grep on input

I have a file with the following contents
3594 -124.049541 44.429077
-123.381222 44.530192
-123.479913 44.625517
-123.578917 44.720704
END
3595 -123.103772 45.009223
-122.427717 45.101578
-122.525757 45.198252
-122.624122 45.294789
END
3676 -122.989567 44.147495
-122.323040 44.238368
-122.419523 44.335217
-122.516322 44.431923
END
END
I'd like to read this file into R, but I'd like to retain only the indented lines.
This seems like a good job for grep, but I'm not sure how to make it work.
Any thoughts?
You could try, where file.txt contains your data:
a <- readLines("file.txt")
a <- a[grepl("^ ", a)]
do.call("rbind", strsplit(a, " "))[, -1]
[,1] [,2]
[1,] "-123.381222" "44.530192"
[2,] "-123.479913" "44.625517"
[3,] "-123.578917" "44.720704"
[4,] "-122.427717" "45.101578"
[5,] "-122.525757" "45.198252"
[6,] "-122.624122" "45.294789"
[7,] "-122.323040" "44.238368"
[8,] "-122.419523" "44.335217"
[9,] "-122.516322" "44.431923"
# Alternatively you can get the data by read.table() as suggested by #Josh
read.table(textConnection(a))
V1 V2
1 -123.3812 44.53019
2 -123.4799 44.62552
3 -123.5789 44.72070
4 -122.4277 45.10158
5 -122.5258 45.19825
6 -122.6241 45.29479
7 -122.3230 44.23837
8 -122.4195 44.33522
9 -122.5163 44.43192

fill gaps in a timeseries with averages

I have a dataframe like so:
day sum_flux samples mean
2005-10-26 0.02 48 0.02
2005-10-27 0.12 12 0.50
It's a series of daily readings spanning 5 years, however some of the days are missing. I want to fill these days with the average of that month from other years.
i.e. if 2005-10-26 was missing, I'd want to use the average of all Octobers in the data set.
if all of October was missing I'd want to apply this average to each missing day.
I think I need to build a function (possibly using plyr) to evaluate the days. However I'm very inexperienced with using the various timeseries objects in R, and conditionally subsetting data and would like some advice. Especially regarding which type of timeseries I should be using.
Many Thanks
Some sample data. I'm assuming that sum_flux is the column that has missing values, and that you want to calculate values for.
library(lubridate)
days <- seq.POSIXt(ymd("2005-10-26"), ymd("2010-10-26"), by = "1 day")
n_days <- length(days)
readings <- data.frame(
day = days,
sum_flux = runif(n_days),
samples = sample(100, n_days, replace = TRUE),
mean = runif(n_days)
)
readings$sum_flux[sample(n_days, floor(n_days / 10))] <- NA
Add a month column.
readings$month <- month(readings$day, label = TRUE)
Use tapply to get the monthly mean flux.
monthly_avg_flux <- with(readings, tapply(sum_flux, month, mean, na.rm = TRUE))
Use this value whenever the flux is missing, or keep the flux if not.
readings$sum_flux2 <- with(readings, ifelse(
is.na(sum_flux),
monthly_avg_flux[month],
sum_flux
))
This is one (very fast) way in data.table.
Using the nice example data from Richie :
require(data.table)
days <- seq(as.IDate("2005-10-26"), as.IDate("2010-10-26"), by = "1 day")
n_days <- length(days)
readings <- data.table(
day = days,
sum_flux = runif(n_days),
samples = sample(100, n_days, replace = TRUE),
mean = runif(n_days)
)
readings$sum_flux[sample(n_days, floor(n_days / 10))] <- NA
readings
day sum_flux samples mean
[1,] 2005-10-26 0.32838686 94 0.09647325
[2,] 2005-10-27 0.14686591 88 0.48728321
[3,] 2005-10-28 0.25800913 51 0.72776002
[4,] 2005-10-29 0.09628937 81 0.80954124
[5,] 2005-10-30 0.70721591 23 0.60165240
[6,] 2005-10-31 0.59555079 2 0.96849533
[7,] 2005-11-01 NA 42 0.37566491
[8,] 2005-11-02 0.01649860 89 0.48866220
[9,] 2005-11-03 0.46802818 49 0.28920807
[10,] 2005-11-04 0.13024856 30 0.29051080
First 10 rows of 1827 printed.
Create the average for each month, in appearance order of each group :
> avg = readings[,mean(sum_flux,na.rm=TRUE),by=list(mnth = month(day))]
> avg
mnth V1
[1,] 10 0.4915999
[2,] 11 0.5107873
[3,] 12 0.4451787
[4,] 1 0.4966040
[5,] 2 0.4972244
[6,] 3 0.4952821
[7,] 4 0.5106539
[8,] 5 0.4717122
[9,] 6 0.5110490
[10,] 7 0.4507383
[11,] 8 0.4680827
[12,] 9 0.5150618
Next reorder avg to start in January :
avg = avg[order(mnth)]
avg
mnth V1
[1,] 1 0.4966040
[2,] 2 0.4972244
[3,] 3 0.4952821
[4,] 4 0.5106539
[5,] 5 0.4717122
[6,] 6 0.5110490
[7,] 7 0.4507383
[8,] 8 0.4680827
[9,] 9 0.5150618
[10,] 10 0.4915999
[11,] 11 0.5107873
[12,] 12 0.4451787
Now update by reference (:=) the sum_flux column, where sum_flux is NA, with the value from avg for that month.
readings[is.na(sum_flux), sum_flux:=avg$V1[month(day)]]
day sum_flux samples mean
[1,] 2005-10-26 0.32838686 94 0.09647325
[2,] 2005-10-27 0.14686591 88 0.48728321
[3,] 2005-10-28 0.25800913 51 0.72776002
[4,] 2005-10-29 0.09628937 81 0.80954124
[5,] 2005-10-30 0.70721591 23 0.60165240
[6,] 2005-10-31 0.59555079 2 0.96849533
[7,] 2005-11-01 0.51078729** 42 0.37566491 # ** updated with the Nov avg
[8,] 2005-11-02 0.01649860 89 0.48866220
[9,] 2005-11-03 0.46802818 49 0.28920807
[10,] 2005-11-04 0.13024856 30 0.29051080
First 10 rows of 1827 printed.
Done.

Resources