My data set is about forest fires and NDVI values (a value ranging from 0 to 1, indicating how green the surface is). It has an initial column giving the date on which each row's forest fire took place, and subsequent columns with the NDVI value on different dates, before and after the fire. NDVI values before the fire are substantially higher than values after the fire. Something like:
data1989 <- data.frame("date_fire" = c("1987-01-01", "1987-07-03", "1988-01-01"),
                       "1986-01-01" = c(0.5, 0.589, 0.66),
                       "1986-06-03" = c(0.56, 0.447, 0.75),
                       "1986-10-19" = c(0.8, NA, 0.83),
                       "1987-01-19" = c(0.75, 0.65, 0.75),
                       "1987-06-19" = c(0.1, 0.55, 0.811),
                       "1987-10-19" = c(0.15, 0.12, 0.780),
                       "1988-01-19" = c(0.2, 0.22, 0.32),
                       "1988-06-19" = c(0.18, 0.21, 0.23),
                       "1988-10-19" = c(0.21, 0.24, 0.250),
                       stringsAsFactors = FALSE)
> data1989
date_fire X1986.01.01 X1986.06.03 X1986.10.19 X1987.01.19 X1987.06.19 X1987.10.19 X1988.01.19 X1988.06.19 X1988.10.19
1 1987-01-01 0.500 0.560 0.80 0.75 0.100 0.15 0.20 0.18 0.21
2 1987-07-03 0.589 0.447 NA 0.65 0.550 0.12 0.22 0.21 0.24
3 1988-01-01 0.660 0.750 0.83 0.75 0.811 0.78 0.32 0.23 0.25
I would like to compute the average of the NDVI values PRIOR to the forest fire, in a new column. In row one, it would be the average of columns 2, 3, 4 and 5.
What I need to get is:
date_fire X1986.01.01 X1986.06.03 X1986.10.19 X1987.01.19 X1987.06.19 X1987.10.19 X1988.01.19 X1988.06.19 X1988.10.19 meanPreFire
1 1987-01-01 0.500 0.560 0.80 0.75 0.100 0.15 0.20 0.18 0.21 0.653
2 1987-07-03 0.589 0.447 NA 0.65 0.550 0.12 0.22 0.21 0.24 0.559
3 1988-01-01 0.660 0.750 0.83 0.75 0.811 0.78 0.32 0.23 0.25 0.764
Thanks!
EDIT: SOLUTION
How to adapt the code when there is more than one leading column to exclude:
data1989 <- data.frame("date_fire" = c("1987-02-01", "1987-07-03", "1988-01-01"),
                       "type" = c("oak", "pine", "oak"),
                       "meanRainfall" = c(600, 300, 450),
                       "1986.01.01" = c(0.5, 0.589, 0.66),
                       "1986.06.03" = c(0.56, 0.447, 0.75),
                       "1986.10.19" = c(0.8, NA, 0.83),
                       "1987.01.19" = c(0.75, 0.65, 0.75),
                       "1987.06.19" = c(0.1, 0.55, 0.811),
                       "1987.10.19" = c(0.15, 0.12, 0.780),
                       "1988.01.19" = c(0.2, 0.22, 0.32),
                       "1988.06.19" = c(0.18, 0.21, 0.23),
                       "1988.10.19" = c(0.21, 0.24, 0.250),
                       check.names = FALSE,
                       stringsAsFactors = FALSE)
Using:
# index of the last NDVI column whose date falls on or before each fire date
j1 <- findInterval(as.Date(data1989$date_fire), as.Date(names(data1989)[-(1:3)], format = "%Y.%m.%d"))
# row/column index matrix of all pre-fire cells
m1 <- cbind(rep(seq_len(nrow(data1989)), j1), sequence(j1))
# row-wise mean of the pre-fire values
data1989$meanPreFire <- tapply(data1989[-(1:3)][m1], m1[, 1], FUN = mean, na.rm = TRUE)
> data1989
date_fire type meanRainfall 1986.01.01 1986.06.03 1986.10.19 1987.01.19 1987.06.19 1987.10.19 1988.01.19 1988.06.19 1988.10.19 meanPreFire
1 1987-02-01 oak 600 0.500 0.560 0.80 0.75 0.100 0.15 0.20 0.18 0.21 0.6525
2 1987-07-03 pine 300 0.589 0.447 NA 0.65 0.550 0.12 0.22 0.21 0.24 0.5590
3 1988-01-01 oak 450 0.660 0.750 0.83 0.75 0.811 0.78 0.32 0.23 0.25 0.7635
Reshape the data to long form and filter dates prior to the forest fire.
library(tidyverse)
data1989 %>%
  pivot_longer(-date_fire, names_to = "date") %>%
  mutate(date_fire = as.Date(date_fire),
         date = as.Date(date, "X%Y.%m.%d")) %>%
  filter(date < date_fire) %>%
  group_by(date_fire) %>%
  summarise(meanPreFire = mean(value, na.rm = TRUE))
# A tibble: 3 x 2
# date_fire meanPreFire
# <date> <dbl>
# 1 1987-01-01 0.62
# 2 1987-07-03 0.559
# 3 1988-01-01 0.764
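If the wide layout from the question is needed, the summary can be joined back onto the original data. A minimal sketch (assuming the same data1989 as above, saving the summary as pre):

pre <- data1989 %>%
  pivot_longer(-date_fire, names_to = "date") %>%
  mutate(date_fire = as.Date(date_fire),
         date = as.Date(date, "X%Y.%m.%d")) %>%
  filter(date < date_fire) %>%
  group_by(date_fire) %>%
  summarise(meanPreFire = mean(value, na.rm = TRUE))

data1989 %>%
  mutate(date_fire = as.Date(date_fire)) %>%
  left_join(pre, by = "date_fire")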
The solution would be much more concise if we kept the data in long(er) form... but this reproduces the desired output:
library(dplyr)
library(tidyr)
data1989 %>%
  pivot_longer(-date_fire, names_to = "date_NDVI", values_to = "value", names_prefix = "^X") %>%
  mutate(date_fire = as.Date(date_fire, "%Y-%m-%d"),
         date_NDVI = as.Date(date_NDVI, "%Y.%m.%d")) %>%
  group_by(date_fire) %>%
  mutate(period = ifelse(date_NDVI < date_fire, "before_fire", "after_fire")) %>%
  group_by(date_fire, period) %>%
  mutate(average_NDVI = mean(value, na.rm = TRUE)) %>%
  pivot_wider(names_from = date_NDVI, names_prefix = "X", values_from = value) %>%
  pivot_wider(names_from = period, values_from = average_NDVI) %>%
  group_by(date_fire) %>%
  summarise_all(funs(sum(., na.rm = TRUE)))
Returns:
# A tibble: 3 x 12
date_fire `X1986-01-01` `X1986-06-03` `X1986-10-19` `X1987-01-19` `X1987-06-19` `X1987-10-19` `X1988-01-19` `X1988-06-19` `X1988-10-19` before_fire after_fire
<date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1987-01-01 0.5 0.56 0.8 0.75 0.1 0.15 0.2 0.18 0.21 0.62 0.265
2 1987-07-03 0.589 0.447 0 0.65 0.55 0.12 0.22 0.21 0.24 0.559 0.198
3 1988-01-01 0.66 0.75 0.83 0.75 0.811 0.78 0.32 0.23 0.25 0.764 0.267
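As an aside, funs() has since been deprecated in dplyr; with dplyr 1.0+ the last step can be written with across() instead. A minimal self-contained illustration (the toy data here is only for demonstration):

library(dplyr)
# summarise_all(funs(sum(., na.rm = TRUE))) becomes:
tibble(g = c(1, 1, 2), a = c(1, NA, 3), b = c(4, 5, NA)) %>%
  group_by(g) %>%
  summarise(across(everything(), ~ sum(.x, na.rm = TRUE)))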
Edit:
If we stop the expression right after calculating the averages, we can use the data in this structure to easily calculate the variance or account for a variable number of observations. I think it's ok to keep date_fire as its own column, but I'd suggest leaving the other dates in a single column (because they correspond to observations), especially if we want to do more analysis with the data using ggplot2 and other tidyverse functions.
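For instance, a sketch of that truncated pipeline (same data1989 as in the question) that also returns the variance and the number of non-missing observations per period:

library(dplyr)
library(tidyr)

data1989 %>%
  pivot_longer(-date_fire, names_to = "date_NDVI", values_to = "value", names_prefix = "^X") %>%
  mutate(date_fire = as.Date(date_fire, "%Y-%m-%d"),
         date_NDVI = as.Date(date_NDVI, "%Y.%m.%d")) %>%
  mutate(period = ifelse(date_NDVI < date_fire, "before_fire", "after_fire")) %>%
  group_by(date_fire, period) %>%
  summarise(average_NDVI = mean(value, na.rm = TRUE),
            var_NDVI = var(value, na.rm = TRUE),
            n_obs = sum(!is.na(value)),
            .groups = "drop")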
We can use base R by creating a row/column index. The column index can be obtained from findInterval with the column names and 'date_fire':
j1 <- findInterval(as.Date(data1989$date_fire), as.Date(names(data1989)[-1]))
# column indexes of the post-fire dates for each row
l1 <- lapply(j1 + 1, `:`, ncol(data1989) - 1)
# row/column index matrices for the pre- and post-fire cells
m1 <- cbind(rep(seq_len(nrow(data1989)), j1), sequence(j1))
m2 <- cbind(rep(seq_len(nrow(data1989)), lengths(l1)), unlist(l1))
data1989$meanPreFire <- tapply(data1989[-1][m1], m1[, 1], FUN = mean, na.rm = TRUE)
data1989$meanPostFire <- tapply(data1989[-1][m2], m2[, 1], FUN = mean, na.rm = TRUE)
data1989
# date_fire 1986-01-01 1986-06-03 1986-10-19 1987-01-19 1987-06-19 1987-10-19 1988-01-19 1988-06-19 1988-10-19
#1 1987-01-01 0.500 0.560 0.80 0.75 0.100 0.15 0.20 0.18 0.21
#2 1987-07-03 0.589 0.447 NA 0.65 0.550 0.12 0.22 0.21 0.24
#3 1988-01-01 0.660 0.750 0.83 0.75 0.811 0.78 0.32 0.23 0.25
# meanPreFire meanPostFire
#1 0.6200 0.2650000
#2 0.5590 0.1975000
#3 0.7635 0.2666667
Or using melt/dcast from data.table
library(data.table)
dcast(melt(setDT(data1989), id.var = 'date_fire')[,
    .(value = mean(value, na.rm = TRUE)),
    .(date_fire, grp = c('postFire', 'preFire')[1 + (as.IDate(variable) < as.IDate(date_fire))])],
  date_fire ~ grp)[data1989, on = .(date_fire)]
# date_fire postFire preFire 1986-01-01 1986-06-03 1986-10-19 1987-01-19 1987-06-19 1987-10-19 1988-01-19 1988-06-19
#1: 1987-01-01 0.2650000 0.6200 0.500 0.560 0.80 0.75 0.100 0.15 0.20 0.18
#2: 1987-07-03 0.1975000 0.5590 0.589 0.447 NA 0.65 0.550 0.12 0.22 0.21
#3: 1988-01-01 0.2666667 0.7635 0.660 0.750 0.83 0.75 0.811 0.78 0.32 0.23
# 1988-10-19
#1: 0.21
#2: 0.24
#3: 0.25
data
data1989 <- data.frame("date_fire" = c("1987-01-01", "1987-07-03", "1988-01-01"),
                       "1986-01-01" = c(0.5, 0.589, 0.66),
                       "1986-06-03" = c(0.56, 0.447, 0.75),
                       "1986-10-19" = c(0.8, NA, 0.83),
                       "1987-01-19" = c(0.75, 0.65, 0.75),
                       "1987-06-19" = c(0.1, 0.55, 0.811),
                       "1987-10-19" = c(0.15, 0.12, 0.780),
                       "1988-01-19" = c(0.2, 0.22, 0.32),
                       "1988-06-19" = c(0.18, 0.21, 0.23),
                       "1988-10-19" = c(0.21, 0.24, 0.250),
                       check.names = FALSE,
                       stringsAsFactors = FALSE)
I need to do a diagonal multiplication on the table below, using a sliding 7*7 window:
Step 1: take the first 7 columns and 7 rows and do the diagonal multiplication.
Step 2: ignore the first column, select the next 7 columns and 7 rows, and do the diagonal multiplication.
Step 3: ignore the 1st and 2nd columns, select the next 7 columns and 7 rows, and do the diagonal multiplication.
Step 4: as in step 3, keep incrementing the columns to ignore (1, 2, 3, ...) and so on.
Note: the diagonal runs upward from the bottom right to the upper left.
Data:
28/02/2013 31/03/2013 30/04/2013 31/05/2013 30/06/2013 31/07/2013 31/08/2013 30/09/2013 31/10/2013 30/11/2013 31/12/2013 31/01/2014 28/02/2014
0.04 0.03 0.03 0.04 0.04 0.07 0.86 0.28 0.05 0.05 0.05 0.04 0.04
0.44 0.44 0.42 0.43 0.40 0.32 0.64 0.02 0.33 0.36 0.30 0.27 0.37
0.57 0.57 0.52 0.59 0.62 0.51 0.79 0.23 0.64 0.66 0.50 0.55 0.60
0.61 0.58 0.60 0.63 0.65 0.59 0.81 0.83 1.00 0.63 0.57 0.63 0.74
0.70 0.65 0.66 0.71 0.73 0.66 0.86 0.90 0.55 0.76 0.65 0.66 0.74
0.76 0.76 0.79 0.74 0.83 0.83 0.86 1.00 0.61 0.83 0.38 0.74 0.75
0.80 0.84 0.89 0.84 0.82 0.83 0.98 0.84 0.44 0.93 0.88 0.78 0.78
Considering each column as A, B, C, D, E, F, G, H, I, J, K and so on... there will be many columns, but the number of rows will always be 7.
The calculation of the 7*7 diagonal matrix is as follows; A is the result of step 1, B of step 2, C of step 3, and so on.
A B C
G8*F7*E6*D5*C4*B3*A2 = 0.00 H8*G7*F6*E5*D4*C3*B2 = 0.02 I8*H7*G6*F5*E4*D3*C2 = 0.00
G8*F7*E6*D5*C4*B3 = 0.08 H8*G7*F6*E5*D4*C3 = 0.08 I8*H7*G6*F5*E4*D3 = 0.06
G8*F7*E6*D5*C4 = 0.19 H8*G7*F6*E5*D4 = 0.18 I8*H7*G6*F5*E4 = 0.14
G8*F7*E6*D5 = 0.37 H8*G7*F6*E5 = 0.31 I8*H7*G6*F5 = 0.22
G8*F7*E6 = 0.59 H8*G7*F6 = 0.47 I8*H7*G6 = 0.38
G8*F7 = 0.81 H8*G7 = 0.72 I8*H7 = 0.44
G8 = 0.98 H8 = 0.84 I8 = 0.44
So the result should be printed as:
A B C
0 0.02 0.00
0.08 0.08 0.06
0.19 0.18 0.14
0.37 0.31 0.22
0.59 0.47 0.38
0.81 0.72 0.44
0.98 0.84 0.44
Similarly, there will be results for D, E, F, and so on.
Please help, thanks in advance.
sapply(lapply(7:NCOL(df), function(i) df[, (i-6):i]),
       function(a) round(x = rev(cumprod(rev(diag(as.matrix(a))))), digits = 2))
# [,1] [,2] [,3] [,4] [,5] [,6] [,7]
#[1,] 0.00 0.00 0.00 0.00 0.00 0.00 0.00
#[2,] 0.09 0.08 0.06 0.08 0.08 0.03 0.00
#[3,] 0.19 0.18 0.14 0.21 0.26 0.05 0.15
#[4,] 0.37 0.31 0.22 0.41 0.33 0.23 0.24
#[5,] 0.59 0.48 0.38 0.51 0.40 0.23 0.38
#[6,] 0.81 0.72 0.44 0.57 0.73 0.30 0.58
#[7,] 0.98 0.84 0.44 0.93 0.88 0.78 0.78
Let me know if the output is correct.
DATA
df = structure(list(A = c(0.04, 0.44, 0.57, 0.61, 0.7, 0.76, 0.8),
B = c(0.03, 0.44, 0.57, 0.58, 0.65, 0.76, 0.84), C = c(0.03,
0.42, 0.52, 0.6, 0.66, 0.79, 0.89), D = c(0.04, 0.43, 0.59,
0.63, 0.71, 0.74, 0.84), E = c(0.04, 0.4, 0.62, 0.65, 0.73,
0.83, 0.82), F = c(0.07, 0.32, 0.51, 0.59, 0.66, 0.83, 0.83
), G = c(0.86, 0.64, 0.79, 0.81, 0.86, 0.86, 0.98), H = c(0.28,
0.02, 0.23, 0.83, 0.9, 1, 0.84), I = c(0.05, 0.33, 0.64,
1, 0.55, 0.61, 0.44), J = c(0.05, 0.36, 0.66, 0.63, 0.76,
0.83, 0.93), K = c(0.05, 0.3, 0.5, 0.57, 0.65, 0.38, 0.88
), L = c(0.04, 0.27, 0.55, 0.63, 0.66, 0.74, 0.78), M = c(0.04,
0.37, 0.6, 0.74, 0.74, 0.75, 0.78)), .Names = c("A", "B",
"C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M"), class = "data.frame", row.names = c(NA,
-7L))
I think a for loop is a good bet here - inspired by this:
n <- nrow(df)          # window size (7)
b <- ncol(df) - n + 1  # number of 7*7 windows
out <- matrix(0, n, b)
ro <- 1:n
for (i in 1:b) {
  co <- i:(n + i - 1)  # columns of window i
  # extract the window's diagonal, then take the cumulative product bottom-up
  out[ro, i] <- rev(cumprod(rev(df[cbind(ro, co)])))
}
# [,1] [,2] [,3] [,4] [,5] [,6]
# [1,] 0.003423605 0.002303868 0.001785601 0.003374663 0.00337162 0.00232112
# [2,] 0.085590113 0.076795599 0.059520050 0.084366587 0.08429050 0.03315886
# [3,] 0.194522983 0.182846664 0.138418720 0.210916467 0.26340780 0.05181072
# [4,] 0.374082660 0.309909600 0.223256000 0.413561700 0.33342760 0.22526400
# [5,] 0.593782000 0.476784000 0.378400000 0.510570000 0.40172000 0.22526400
# [6,] 0.813400000 0.722400000 0.440000000 0.567300000 0.73040000 0.29640000
# [7,] 0.980000000 0.840000000 0.440000000 0.930000000 0.88000000 0.78000000
Wrap the answer in round to alter how it is printed.
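For example, to show two decimals (matching the layout in the question):

round(out, 2)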
Another way, also using indexing...
ro <- nrow(df)
co <- ncol(df)
b <- co - ro + 1                  # number of windows
id <- pmin(ro, b)
ccols <- mapply(seq, 1:b, id:co)  # column indices of each window's diagonal
rrows <- rep(1:ro, b)             # matching row indices
mat <- matrix(rev(df[cbind(rrows, c(ccols))]), nrow = ro)
matrix(rev(matrixStats::colCumprods(mat)), nrow = ro)
A quick benchmark on larger data seems to show that method two is considerably faster; however, if you convert the data frame to a matrix, the for loop has similar speed.
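A sketch of such a benchmark (the microbenchmark package and the random data are assumptions here; only the relative timings matter):

library(microbenchmark)

set.seed(1)
m <- matrix(runif(7 * 5000), nrow = 7)  # larger random data with 7 rows
df_big <- as.data.frame(m)

method_loop <- function(df) {
  n <- nrow(df)
  b <- ncol(df) - n + 1
  out <- matrix(0, n, b)
  ro <- 1:n
  for (i in 1:b) {
    co <- i:(n + i - 1)
    out[ro, i] <- rev(cumprod(rev(df[cbind(ro, co)])))
  }
  out
}

method_index <- function(df) {
  ro <- nrow(df)
  co <- ncol(df)
  b <- co - ro + 1
  id <- pmin(ro, b)
  ccols <- mapply(seq, 1:b, id:co)
  rrows <- rep(1:ro, b)
  mat <- matrix(rev(df[cbind(rrows, c(ccols))]), nrow = ro)
  matrix(rev(matrixStats::colCumprods(mat)), nrow = ro)
}

microbenchmark(loop_df  = method_loop(df_big),
               indexing = method_index(df_big),
               loop_mat = method_loop(m))  # the for loop on a matrix input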
I have a table like the one below and would like to create suggestions based on row values in R.
This is what I have:
id class1 class2 class3 class4
A 0.98 0.48 0.21 0.99
B 0.22 0.31 0.41 0.11
C 0.70 0.81 0.61 0.21
I would like to have two new columns ('sugg1', 'sugg2') that give the column names of the two top maximum values in each row, i.e. for the first row, 0.99 is the maximum value, so its corresponding column name is class4, and the next highest value is 0.98, for which the column name is class1.
id class1 class2 class3 class4 sugg1 sugg2
A 0.98 0.48 0.21 0.99 class4 class1
B 0.22 0.31 0.41 0.11 class3 class2
C 0.70 0.81 0.61 0.21 class2 class1
We can use apply with MARGIN = 1 to loop over the rows, sort the values of each row in decreasing order, get the first two (head(...)), transpose the output, and create two new columns in the original dataset.
df1[paste0("sugg", 1:2)] <- t(apply(df1[-1], 1, FUN = function(x) names(head(sort(-x),2))))
df1
# id class1 class2 class3 class4 sugg1 sugg2
#1 A 0.98 0.48 0.21 0.99 class4 class1
#2 B 0.22 0.31 0.41 0.11 class3 class2
#3 C 0.70 0.81 0.61 0.21 class2 class1
This can also be done by melting into 'long' format, subsetting the first two rows after grouping by 'id' and ordering by 'value', and then joining with the original dataset:
library(data.table) # v1.9.7+
df1[dcast(melt(df1, id.var = "id")[order(-value), head(variable, 2), id],
          id ~ paste0("sugg", rowid(id)), value.var = "V1"), on = "id"]
# id class1 class2 class3 class4 sugg1 sugg2
#1: A 0.98 0.48 0.21 0.99 class4 class1
#2: B 0.22 0.31 0.41 0.11 class3 class2
#3: C 0.70 0.81 0.61 0.21 class2 class1
data
df1 <- structure(list(id = c("A", "B", "C"), class1 = c(0.98, 0.22,
0.7), class2 = c(0.48, 0.31, 0.81), class3 = c(0.21, 0.41, 0.61
), class4 = c(0.99, 0.11, 0.21)), .Names = c("id", "class1",
"class2", "class3", "class4"), class = "data.frame",
row.names = c(NA, -3L))
I have a data frame with monthly returns and their corresponding month.
Data <- read.csv("C:/Users/h/Desktop/overflow.csv", sep=";", dec=",")
Data$Date <- as.Date(as.character(Data$Date), format="%Y-%m-%d")
The data frame looks like this now:
> Data
Fund.A Fund.B Fund.C Fund.D
2012-01-01 -0.01 0.04 0.11 0.10
2012-02-01 -0.04 -0.06 0.08 0.11
2012-03-01 -0.04 -0.07 0.15 -0.03
2012-04-01 0.00 -0.08 -0.04 0.13
2012-05-01 -0.07 0.10 0.06 0.02
2012-06-01 -0.05 0.06 0.06 -0.02
2012-07-01 0.12 -0.06 -0.09 -0.06
2012-08-01 0.08 -0.03 0.05 0.13
2012-09-01 0.10 0.07 -0.02 0.15
2012-10-01 -0.08 0.14 0.00 -0.04
2012-11-01 -0.09 0.11 -0.07 0.12
2012-12-01 -0.01 -0.09 0.07 -0.02
Now I want to continue the time series with new returns from a new csv, by simply matching the new return with the appropriate Fund in "Data". My problem is that new assets might have been added, messing up the order.
import <- read.csv("C:/Users/h/Desktop/import.csv", sep=";", dec=",")
import
2013-01-01
1 Funds: NA
2 Fund A 0.04
3 Fund AA -0.09
4 Fund C -0.10
5 Fund D 0.03
6 Fund B 0.14
As you can see, the "import" csv has new assets (Fund AA) as well as assets already in "Data" (Fund A to Fund D), and the funds are in rows rather than columns. How can I write code that matches each value in "import" to the right column (fund) in "Data" and adds it as a new row, and, if a new asset has been added, creates a column for that asset?
As a bonus, the code would only add a row if the date in "import" is more recent than the most recent one in "Data", so that only new returns are imported.
Appreciate it!
For time series purposes, I would recommend using xts; it makes life a bit easier. Borrowing from Arun's usable data:
olddata <- structure(list(Date = structure(c(15340, 15371, 15400, 15431,
15461, 15492, 15522, 15553, 15584, 15614, 15645, 15675), class = "Date"),
Fund.A = c(-0.01, -0.04, -0.04, 0, -0.07, -0.05, 0.12, 0.08, 0.1, -0.08,
-0.09, -0.01), Fund.B = c(0.04, -0.06, -0.07, -0.08, 0.1, 0.06, -0.06,
-0.03, 0.07, 0.14, 0.11, -0.09), Fund.C = c(0.11, 0.08, 0.15, -0.04,
0.06, 0.06, -0.09, 0.05, -0.02, 0, -0.07, 0.07), Fund.D = c(0.1, 0.11,
-0.03, 0.13, 0.02, -0.02, -0.06, 0.13, 0.15, -0.04, 0.12, -0.02)),
.Names = c("Date", "Fund.A", "Fund.B", "Fund.C", "Fund.D"),
row.names = c(NA, 12L), class = "data.frame")
newimport <- structure(list(funds = c("Fund.A", "Fund.AA", "Fund.C",
"Fund.D", "Fund.B"), `2013-01-01` = c(0.04, -0.09, -0.1, 0.03, 0.14)),
.Names = c("funds", "2013-01-01"), row.names = c(NA, -5L),
class = "data.frame")
Convert the data to xts for easy date-wise subsetting:
olddata <- xts(olddata[,-1], olddata$Date)
newdata <- xts(t(newimport[,-1]), as.Date(colnames(newimport)[-1]))
colnames(newdata) <- newimport[,1]
Merge data together while taking care of any new columns:
cols <- names(newdata) %in% names(olddata)
combineData <- merge(rbind(olddata, newdata[,cols]), newdata[,!cols])
combineData
Fund.A Fund.B Fund.C Fund.D Fund.AA
2012-01-01 -0.01 0.04 0.11 0.10 NA
2012-02-01 -0.04 -0.06 0.08 0.11 NA
2012-03-01 -0.04 -0.07 0.15 -0.03 NA
2012-04-01 0.00 -0.08 -0.04 0.13 NA
2012-05-01 -0.07 0.10 0.06 0.02 NA
2012-06-01 -0.05 0.06 0.06 -0.02 NA
2012-07-01 0.12 -0.06 -0.09 -0.06 NA
2012-08-01 0.08 -0.03 0.05 0.13 NA
2012-09-01 0.10 0.07 -0.02 0.15 NA
2012-10-01 -0.08 0.14 0.00 -0.04 NA
2012-11-01 -0.09 0.11 -0.07 0.12 NA
2012-12-01 -0.01 -0.09 0.07 -0.02 NA
2013-01-01 0.04 0.14 -0.10 0.03 -0.09
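For the bonus requirement (only adding rows whose dates are more recent than the last one in the existing data), the new observations can be filtered by date before merging. A minimal sketch with xts:

# keep only rows of newdata dated after the last observation in olddata
newdata <- newdata[index(newdata) > end(olddata), ]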
So my test data looks like this:
structure(list(day = c(1L, 1L, 2L, 2L, 2L, 3L, 3L, 4L, 4L, 4L
), Left = c(0.25, 0.33, 0, 0, 0.25, 0.33, 0.5, 0.33, 0.5, 0),
Left1 = c(NA, NA, 0, 0.5, 0.25, 0.33, 0.1, 0.33, 0.5, 0),
Middle = c(0, 0, 0.3, 0, 0.25, 0, 0.3, 0.33, 0, 0), Right = c(0.25,
0.33, 0.3, 0.5, 0.25, 0.33, 0.1, 0, 0, 0.25), Right1 = c(0.5,
0.33, 0.3, 0, 0, 0, 0, 0, 0, 0.75), Side = structure(c(2L,
2L, 2L, 2L, 1L, 1L, 1L, 1L, 1L, 2L), .Label = c("L", "R"), class = "factor")), .Names = c("day",
"Left", "Left1", "Middle", "Right", "Right1", "Side"), class = "data.frame", row.names = c(NA,
-10L))
or this:
day Left Left1 Middle Right Right1 Side
1 0.25 NA 0.00 0.25 0.50 R
1 0.33 NA 0.00 0.33 0.33 R
2 0.00 0.00 0.30 0.30 0.30 R
2 0.00 0.50 0.00 0.50 0.00 R
2 0.25 0.25 0.25 0.25 0.00 L
3 0.33 0.33 0.00 0.33 0.00 L
I would like to write a loop to find the standard error and average value for each day on the chosen side.
OK, so far I have this code:
td <- read.csv('test data.csv')
IDs <- unique(td$day)
se <- function(x) sqrt(var(x)/length(x))
for (i in 1:length(IDs)) {
  day.i <- which(td$day == IDs[i])
  td.i <- td[day.i, ]
  if (td$Side == 'L') {
    side <- cbind(td.i$Left + td.i$Left1)
  } else {
    side <- cbind(td.i$Right + td.i$Right1)
  }
  mean(side)
  se(side)
  print(mean)
  print(se)
}
But I am getting error messages like this:
Error: unexpected '}' in "}"
Obviously, I am also not getting the printout of means for each day. Does anyone know why?
I am also working on things here: http://www.talkstats.com/showthread.php/27187-Writing-a-mean-loop..-(literally)
Convert your data into a list and work with that instead:
First, split up your data into a list according to Side, subsetting the relevant columns along the way.
td = split(td, td$Side)
NAMES = names(td)
td = lapply(1:length(td),
            function(x) td[[x]][c(1, grep(NAMES[x], names(td[[x]])))])
names(td) = NAMES
td
# $L
# day Left Left1
# 5 2 0.25 0.25
# 6 3 0.33 0.33
# 7 3 0.50 0.10
# 8 4 0.33 0.33
# 9 4 0.50 0.50
#
# $R
# day Right Right1
# 1 1 0.25 0.50
# 2 1 0.33 0.33
# 3 2 0.30 0.30
# 4 2 0.50 0.00
# 10 4 0.25 0.75
Then, use lapply and aggregate to apply whatever functions you want to your data.
lapply(1:length(td),
       function(x) aggregate(list(td[[x]][-1]),
                             list(day = td[[x]]$day), mean))
# [[1]]
# day Left Left1
# 1 2 0.250 0.250
# 2 3 0.415 0.215
# 3 4 0.415 0.415
#
# [[2]]
# day Right Right1
# 1 1 0.29 0.415
# 2 2 0.40 0.150
# 3 4 0.25 0.750
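The same pattern also gives the standard error; a sketch, swapping the asker's se() function in for mean:

se <- function(x) sqrt(var(x)/length(x))
# days with a single observation return NA, since var() needs at least two values
lapply(1:length(td),
       function(x) aggregate(list(td[[x]][-1]),
                             list(day = td[[x]]$day), se))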
Still not entirely sure if I understand (that is, whether you want the mean and SE for both Left and Left1, or some sort of combination like the sum). This is how I interpreted your question:
FUN <- function(dat, side = "L") {
  DF <- split(dat, dat$Side)[[side]]
  ind <- if (side == "L") 2:3 else 5:6
  stderr <- function(x) sqrt(var(x)/length(x))
  meanNse <- function(x) c(mean = mean(x), se = stderr(x))
  OUT <- aggregate(DF[, ind], list(DF[, 1]), meanNse)
  names(OUT)[1] <- "day"
  return(OUT)
}
#test it
FUN(td)
FUN(td, "R")
Which yields:
> FUN(td)
day Left.mean Left.se Left1.mean Left1.se
1 2 0.250 NA 0.250 NA
2 3 0.415 0.085 0.215 0.115
3 4 0.415 0.085 0.415 0.085
> FUN(td, "R")
day Right.mean Right.se Right1.mean Right1.se
1 1 0.29 0.04 0.415 0.085
2 2 0.40 0.10 0.150 0.150
3 4 0.25 NA 0.750 NA