DATA AND REQUIREMENTS
The first table (myMatrix1) is from an old geological survey that used different region boundaries (begin and finish) columns to the newer survey.
What I wish to do is to match the begin and finish boundaries and then create two tables one for the new data on sedimentation and one for the new data on bore width characterised as a boolean.
myMatrix1 <- read.table("/path/to/file")
myMatrix2 <- read.table("/path/to/file")
> head(myMatrix1) # this is the old data
sampleIDs begin finish
1 19990224 4 5
2 20000224 5 6
3 20010203 6 8
4 20019024 29 30
5 20020201 51 52
> head(myMatrix2) # this is the new data
begin finish sedimentation boreWidth
1 0 10 1.002455 0.014354
2 11 367 2.094351 0.056431
3 368 920 0.450275 0.154105
4 921 1414 2.250820 1.004353
5 1415 5278 0.114109 NA`
Desired output:
> head(myMatrix6)
sampleIDs begin finish sedimentation #myMatrix4
1 19990224 4 5 1.002455
2 20000224 5 6 1.002455
3 20010203 6 8 2.094351
4 20019024 29 30 2.094351
5 20020201 51 52 2.094351
> head(myMatrix7)
sampleIDs begin finish boreWidthThresh #myMatrix5
1 19990224 4 5 FALSE
2 20000224 5 6 FALSE
3 20010203 6 8 FALSE
4 20019024 29 30 FALSE
5 20020201 51 52 FALSE`
CODE
The following code has taken me several hours to run on my dataset (about 5 million data points). Is there any way to change the code to make it run any faster?
# create empty matrix for sedimentation
myMatrix6 <- data.frame(NA,NA,NA,NA)[0,]
names(myMatrix6) <- letters[1:4]
# create empty matrix for bore
myMatrix7 <- data.frame(NA,NA,NA,NA)[0,]
names(myMatrix7) <- letters[1:4]
for (i in 1:nrow(myMatrix2))
{
# create matrix that has the value of myMatrix1$begin being
# situated between the values of myMatrix2begin[i] and myMatrix2finish[i]
myMatrix3 <- myMatrix1[which((myMatrix1$begin > myMatrix2$begin[i]) & (myMatrix1$begin < myMatrix2$finish[i])),]
myMatrix4 <- rep(myMatrix2$sedimentation, nrow(myMatrix3))
if (is.na(myMatrix2$boreWidth[i])) {
myMatrix5 <- rep(NA, nrow(myMatrix3))
}
else if (myMatrix2$boreWidth[i] == 0) {
myMatrix5 <- rep(TRUE, nrow(myMatrix3))
}
else if (myMatrix2$boreWidth[i] > 0) {
myMatrix5 <- rep(FALSE, nrow(myMatrix3))
}
myMatrix6 <- rbind(myMatrix6, cbind(myMatrix3, myMatrix4))
myMatrix7 <- rbind(myMatrix7, cbind(myMatrix3, myMatrix5))
}
EDIT:
> dput(head(myMatrix2)
structure(list(V1 = structure(c(6L, 1L, 2L, 4L, 5L, 3L), .Label = c("0",
"11", "1415", "368", "921", "begin"), class = "factor"), V2 = structure(c(6L,
1L, 3L, 5L, 2L, 4L), .Label = c("10", "1414", "367", "5278",
"920", "finish"), class = "factor"), V3 = structure(c(6L, 3L,
4L, 2L, 5L, 1L), .Label = c("0.114109", "0.450275", "1.002455",
"2.094351", "2.250820", "sedimentation"), class = "factor"),
V4 = structure(c(5L, 1L, 2L, 3L, 4L, 6L), .Label = c("0.014354",
"0.056431", "0.154105", "1.004353", "boreWidth", "NA"), class = "factor")), .Names = c("V1",
"V2", "V3", "V4"), row.names = c(NA, 6L), class = "data.frame")
> dput(head(myMatrix1)
structure(list(V1 = structure(c(6L, 1L, 2L, 3L, 4L, 5L), .Label = c("19990224",
"20000224", "20010203", "20019024", "20020201", "sampleIDs"), class = "factor"),
V2 = structure(c(6L, 2L, 3L, 5L, 1L, 4L), .Label = c("29",
"4", "5", "51", "6", "begin"), class = "factor"), V3 = structure(c(6L,
2L, 4L, 5L, 1L, 3L), .Label = c("30", "5", "52", "6", "8",
"finish"), class = "factor")), .Names = c("V1", "V2", "V3"
), row.names = c(NA, 6L), class = "data.frame")
First look at these general suggestions on speeding up code: https://stackoverflow.com/a/8474941/636656
The first thing that jumps out at me is that I'd create only one results matrix. That way you're not duplicating the sampleIDs begin finish columns, and you can avoid any overhead that comes with running the matching algorithm twice.
Doing that, you can avoid selecting more than once (although it's trivial in terms of speed as long as you store your selection vector rather than re-calculate).
Here's a solution using apply:
myMatrix1 <- data.frame(sampleIDs=c(19990224,20000224),begin=c(4,5),finish=c(5,6))
myMatrix2 <- data.frame(begin=c(0,11),finish=c(10,367),sed=c(1.002,2.01),boreWidth=c(.014,.056))
glommer <- function(x,myMatrix2) {
x[4:5] <- as.numeric(myMatrix2[ myMatrix2$begin <= x["begin"] & myMatrix2$finish >= x["finish"], c("sed","boreWidth") ])
names(x)[4:5] <- c("sed","boreWidth")
return( x )
}
> t(apply( myMatrix1, 1, glommer, myMatrix2=myMatrix2))
sampleIDs begin finish sed boreWidth
[1,] 19990224 4 5 1.002 0.014
[2,] 20000224 5 6 1.002 0.014
I used apply and stored everything as numeric. Other approaches would be to return a data.frame and have the sampleIDs and begin, finish be ints. That might avoid some problems with floating point error.
This solution assumes there are no boundary cases (e.g. the begin, finish times of myMatrix1 are entirely contained within the begin, finish times of the other). If your data is more complicated, just change the glommer() function. How you want to handle that is a substantive question.
Related
I have a list of lists similar to the toy example given here. I would like to loop through this list to return a new list which has had elements removed based on a variable.
dput(head(list)):
list(FEB_gems = list(GAME1 = structure(list(GAME1_Class = structure(c(2L,
1L, 5L, 4L, 3L), .Label = c("fighter", "paladin", "rouge", "sorcerer",
"wizard"), class = "factor"), GAME1_Race = structure(c(3L, 1L,
4L, 3L, 2L), .Label = c("elf", "gnome", "human", "orc"), class = "factor"),
GAME1_Alignment = structure(c(4L, 2L, 1L, 5L, 3L), .Label = c("CE",
"CG", "LG", "NE", "NN"), class = "factor"), GAME1_Level = c(6,
7, 6, 7, 7), GAME1_Alive = structure(c(1L, 1L, 1L, 1L, 1L
), .Label = "y", class = "factor")), class = "data.frame", row.names = c(NA,
-5L)), GAME2 = structure(list(GAME2_Class = structure(c(3L, 5L,
2L, 4L, 1L), .Label = c("bard", "cleric", "fighter", "monk",
"wizard"), class = "factor"), GAME2_Race = structure(c(2L, 3L,
2L, 4L, 1L), .Label = c("dwarf", "elf", "half-elf", "human"), class = "factor"),
GAME2_Alignment = structure(c(4L, 2L, 1L, 5L, 3L), .Label = c("CE",
"CG", "LG", "NE", "NN"), class = "factor"), GAME2_Level = c(5,
5, 5, 5, 5), GAME2_Alive = structure(c(1L, 2L, 2L, 2L, 2L
), .Label = c("n", "y"), class = "factor")), class = "data.frame", row.names = c(NA,
-5L))), MAR_gems = list(GAME3 = structure(list(GAME3_Class = structure(c(2L,
1L, 5L, 4L, 3L), .Label = c("barbarian", "cleric", "monk", "ranger",
"warlock"), class = "factor"), GAME3_Race = structure(c(2L, 3L,
2L, 4L, 1L), .Label = c("dwarf", "elf", "half-elf", "human"), class = "factor"),
GAME3_Alignment = structure(c(2L, 2L, 1L, 3L, 2L), .Label = c("CE",
"LG", "LN"), class = "factor"), GAME3_Level = c(1, 1, 1,
1, 1), GAME3_Alive = structure(c(2L, 2L, 2L, 1L, 2L), .Label = c("n",
"y"), class = "factor")), class = "data.frame", row.names = c(NA,
-5L)), GAME4 = structure(list(GAME4_Class = structure(c(2L, 1L,
5L, 4L, 3L), .Label = c("fighter", "paladin", "rouge", "sorcerer",
"wizard"), class = "factor"), GAME4_Race = structure(c(2L, 3L,
2L, 4L, 1L), .Label = c("dwarf", "elf", "half-elf", "human"), class = "factor"),
GAME4_Alignment = structure(c(1L, 2L, 1L, 4L, 3L), .Label = c("CE",
"CG", "LG", "LN"), class = "factor"), GAME4_Level = c(5,
5, 5, 5, 5), GAME4_Alive = structure(c(1L, 2L, 2L, 2L, 2L
), .Label = c("n", "y"), class = "factor")), class = "data.frame", row.names = c(NA,
-5L))))
I have made some attempt at manually sub-setting the list of lists. A function would be preferable is because I have multiple types of data to subset.
1) sub-setting Level columns based on interger
df1 <- Games.Split[[1]][[1]]
Level <- df1[which(df1[4] > 6),]
Games.Split[[1]][[1]] <- Level
df1:
GAME1_Class GAME1_Race GAME1_Alignment GAME1_Level GAME1_Alive
1 paladin human NE 6 y
2 fighter elf CG 7 y
3 wizard orc CE 6 y
4 sorcerer human NN 7 y
5 rouge gnome LG 7 y
Level:
GAME1_Class GAME1_Race GAME1_Alignment GAME1_Level GAME1_Alive
2 fighter elf CG 7 y
4 sorcerer human NN 7 y
5 rouge gnome LG 7 y
2) sub-setting Alive columns based on string
df2 <- Games.Split[[1]][[2]]
Alive <- df2[which(df2[5] == 'y'),]
Games.Split[[1]][[2]] <- Alive
df2:
GAME2_Class GAME2_Race GAME2_Alignment GAME2_Level GAME2_Alive
1 fighter elf NE 5 n
2 wizard half-elf CG 5 y
3 cleric elf CE 5 y
4 monk human NN 5 y
5 bard dwarf LG 5 y
Alive:
GAME2_Class GAME2_Race GAME2_Alignment GAME2_Level GAME2_Alive
2 wizard half-elf CG 5 y
3 cleric elf CE 5 y
4 monk human NN 5 y
5 bard dwarf LG 5 y
However I'm struggling to put this into action in a for loop to perform these sub-setting tasks on the entire list.
for (i in Games.Split){
for (j in i){
Alive = j[which(j[5] == 'y'),]
j <- Alive
}
}
Overall, a function/ method that can iterate through the whole list to provide a new list which no longer has the removed elements.
Since you have two levels of lists to organizing the dataframes, this will require a nested list apply function (lapply)-- same as a loop but a little neater. Here is an example that creates a function to subset the game list (gameList) based on maximum level (maxLevel):
listSubset <- function(x, maxLevel){
lapply(x, function(ls){
lapply(ls, function(df) df[df[[grep('Level', names(df), value = TRUE)]] < maxLevel, ])
})
}
listSubset(x = gameList, maxLevel = 6)
Output:
$`FEB_gems`
$`FEB_gems`$`GAME1`
[1] GAME1_Class GAME1_Race GAME1_Alignment GAME1_Level GAME1_Alive
<0 rows> (or 0-length row.names)
$`FEB_gems`$GAME2
GAME2_Class GAME2_Race GAME2_Alignment GAME2_Level GAME2_Alive
1 fighter elf NE 5 n
2 wizard half-elf CG 5 y
3 cleric elf CE 5 y
4 monk human NN 5 y
5 bard dwarf LG 5 y
$MAR_gems
$MAR_gems$`GAME3`
GAME3_Class GAME3_Race GAME3_Alignment GAME3_Level GAME3_Alive
1 cleric elf LG 1 y
2 barbarian half-elf LG 1 y
3 warlock elf CE 1 y
4 ranger human LN 1 n
5 monk dwarf LG 1 y
$MAR_gems$GAME4
GAME4_Class GAME4_Race GAME4_Alignment GAME4_Level GAME4_Alive
1 paladin elf CE 5 n
2 fighter half-elf CG 5 y
3 wizard elf CE 5 y
4 sorcerer human LN 5 y
5 rouge dwarf LG 5 y
All the functions are in base R, so no need to install and learn new packages.
If there are only two nested lists, and want different filtering conditions, apply on it individually and assign the output back to the list element. We loop through the master list with map and then apply the logical conditions
library(purrr)
library(dplyr)
lst2 <- map(lst1, ~ {
.x[[1]] <- .x[[1]] %>%
filter_at(4, all_vars(. > 6))
.x[[2]] <- .x[[2]] %>%
filter_at(5, all_vars(. == 'y'))
.x
})
I'd argue that life would be easier if you restructure your data, then use dplyr's filter to pull out what you want (or omit what you don't want). Assuming your original data is called foo:
# Load libraries
library(dplyr)
library(purrr)
# Remove one list
bar <- unlist(foo, recursive = FALSE)
# Get names of campaigns and games
campaign_games <- data.frame(do.call(rbind, strsplit(names(bar), "\\.")))
# Add campaigns and games numbers to data frames
ls_games <- pmap(list(campaign_games[, 1], campaign_games[, 2], bar), cbind)
# Rename all columns
ls_games <- lapply(ls_games, function(x){names(x) <- c("Campaign", "Game_n", "Class", "Race", "Alignment", "Level", "Alive"); x})
# Convert to data frame
df <- bind_rows(ls_games)
# Look at result
head(df)
Now your data looks like this:
# Campaign Game_n Class Race Alignment Level Alive
# 1 FEB_gems GAME1 paladin human NE 6 y
# 2 FEB_gems GAME1 fighter elf CG 7 y
# 3 FEB_gems GAME1 wizard orc CE 6 y
# 4 FEB_gems GAME1 sorcerer human NN 7 y
# 5 FEB_gems GAME1 rouge gnome LG 7 y
# 6 FEB_gems GAME2 fighter elf NE 5 n
which is easy to handle. For example, pull those that are alive in game 1 of FEB gems and are level 7 or higher.
df %>% filter(Alive == "y", Campaign == "FEB_gems",
Level >= 7, Game_n == "GAME1")
# Campaign Game_n Class Race Alignment Level Alive
# 1 FEB_gems GAME1 fighter elf CG 7 y
# 2 FEB_gems GAME1 sorcerer human NN 7 y
# 3 FEB_gems GAME1 rouge gnome LG 7 y
I have a data frame with >300000 rows. I want to select matches to three strings and move those rows that match to the end of the data frame. I need to keep the rows that don't match in the final data frame. In the end, my data will be plotted and the reordered data frame will be written to xls.
Here is some example data:
mydata <- structure(list(id = structure(c(1L, 4L, 1L, 2L, 3L, 2L, 1L, 6L,
5L, 2L, 1L, 3L, 4L), .Label = c("match1", "match2", "match3",
"match4", "match8", "match9"), class = "factor"), A = structure(c(6L,
5L, 7L, 4L, 10L, 7L, 8L, 8L, 9L, 4L, 3L, 2L, 1L), .Label = c("19",
"2", "20", "3", "4", "6", "8", "H", "j", "T"), class = "factor"),
B = structure(c(2L, 2L, 2L, 3L, 4L, 2L, 4L, 5L, 2L, 3L, 5L,
3L, 1L), .Label = c("beside", "in", "out", "over", "under"
), class = "factor")), .Names = c("id", "A", "B"), row.names = c(NA,
-13L), class = "data.frame")
Which looks like this:
id A B
match1 6 in
match4 4 in
match1 8 in
match2 3 out
match3 T over
match2 8 in
match1 H over
match9 H under
match8 j in
match2 3 out
match1 20 under
match3 2 out
match4 19 beside
I want to use this vector of strings to move rows that match to the end of the data frame.
matchlist = c("match1", "match2", "match3")
The resulting data frame would look like this:
id A B
match4 4 in
match9 H under
match8 j in
match4 19 beside
match1 H over
match1 6 in
match1 8 in
match1 20 under
match2 3 out
match2 8 in
match2 3 out
match3 T over
match3 2 out
I need to retain the non-matching rows.
I looked at this post Select and sort rows of a data frame based on a vector but it loses the non-matching data.
Try this:
x <- as.character(df$id) %in% matchlist
rbind(df[!x,], df[x,])
# id A B
# 2 match4 4 in
# 8 match9 H under
# 9 match8 j in
# 13 match4 19 beside
# 1 match1 6 in
# 3 match1 8 in
# 4 match2 3 out
# 5 match3 T over
# 6 match2 8 in
# 7 match1 H over
# 10 match2 3 out
# 11 match1 20 under
# 12 match3 2 out
Consider this short tidyverse solution:
mydata %>%
arrange(id %in% match_list)
Here is a solution without grep:
matched <- mydata$id %in% matchlist
mydata2 <- rbind(mydata[!matched,], mydata[matched,])
You could of course order the matched rows before the rbind, then you would get exactly the same output as in your example.
top = mydata[-grep("match1|match2|match3", mydata$id),]
bottom = mydata[grep("match1|match2|match3", mydata$id),]
bottom = bottom[order(bottom$id),]
xls = rbind(top, bottom)
In sparkR I have a DataFrame data.
When I type head(data) we get this output
C0 C1 C2 C3
1 id user_id foreign_model_id machine_id
2 1 3145 4 12
3 2 4079 1 8
4 3 1174 7 1
5 4 2386 9 9
6 5 5524 1 7
I want to remove C0,C1,C2,C3 because they give me problems later one. For example when I use the filter function:
filter(data,data$machine_id==1)
can't run because of this.
I have read the data like this
data <- read.df(sqlContext, "/home/ole/.../data", "com.databricks.spark.csv")
SparkR made the header into the first row and gave the DataFrame a new header because the default for the header option is "false". Set the header option to header="true" and then you won't have to handle with this problem.
data <- read.df(sqlContext, "/home/ole/.../data", "com.databricks.spark.csv", header="true")
Try
colnames(data) <- unlist(data[1,])
data <- data[-1,]
> data
# id user_id foreign_model_id machine_id
#2 1 3145 4 12
#3 2 4079 1 8
#4 3 1174 7 1
#5 4 2386 9 9
#6 5 5524 1 7
If you wish, you can add rownames(data) <- NULL to correct for the row numbers after the deletion of the first row.
After this manipulation, you can select rows that correspond to certain criteria, like
subset(data, data$machine_id==1)
# id user_id foreign_model_id machine_id
#4 3 1174 7 1
In base R, the function filter() suggested in the OP is part of the stats namespace and is usually reserved for the analysis of time series.
data
data <- structure(list(C0 = structure(c(6L, 1L, 2L, 3L, 4L, 5L),
.Label = c("1", "2", "3", "4", "5", "id"), class = "factor"),
C1 = structure(c(6L, 3L, 4L, 1L, 2L, 5L), .Label = c("1174", "2386",
"3145", "4079", "5524", "user_id"), class = "factor"),
C2 = structure(c(5L, 2L, 1L, 3L, 4L, 1L),
.Label = c("1", "4", "7", "9", "foreign_model_id"), class = "factor"),
C3 = structure(c(6L, 2L, 4L, 1L, 5L, 3L),
.Label = c("1", "12", "7", "8", "9", "machine_id"), class = "factor")),
.Names = c("C0", "C1", "C2", "C3"), class = "data.frame",
row.names = c("1", "2", "3", "4", "5", "6"))
try this
names <- c()
for (i in seq(along = names(data))) {
names <- c(names, toString(data[1,i]))
}
names(data) <- names
data <- data[-1,]
I simply can't use the answers because in sparkR it can't run: object of type 'S4' is not subsettable. I solved the problem this way, however, I think there is a better way to solve it.
data <- withColumnRenamed(data, "C0","id")
data <- withColumnRenamed(data, "C1","user_id")
data <- withColumnRenamed(data, "C2","foreign_model_id")
data <- withColumnRenamed(data, "C3","machine_id")
And now I can successfully use the filter function as I want to.
I have a set of dates and times for several individuals (ID) that correspond to our primary outcome measure (Y) and a covariate (X1).
My objective is to replace missing X1 values for each of the Y rows if the X1 measurement was recorded within a +/- 24 hour period from the date/time that the Y variable was measured. To make this easier to visualize (and load into R), here is how the data are currently arranged:
structure(list(ID = c(1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 3L, 3L, 3L, 3L), TIME = structure(1:15, .Label = c("01/01/2013 12:01",
"01/03/2013 08:49", "01/03/2013 20:52", "02/01/2013 05:00", "02/03/2013 05:30",
"02/03/2013 21:14", "02/05/2013 05:15", "02/12/2013 05:03", "02/15/2013 04:16",
"02/16/2013 04:12", "02/16/2013 21:02", "03/01/2010 17:58", "03/02/2010 00:10",
"03/03/2010 10:45", "03/04/2010 09:00"), class = "factor"), Y = structure(c(1L,
5L, 7L, 1L, 1L, 2L, 1L, 1L, 1L, 4L, 3L, 1L, 8L, 1L, 6L), .Label = c(".",
"22", "35", "4", "5", "6", "8", "9"), class = "factor"), X1 = structure(c(2L,
1L, 1L, 7L, 7L, 1L, 4L, 4L, 3L, 1L, 1L, 6L, 1L, 5L, 1L), .Label = c(".",
"0.1", "0.2", "0.4", "0.6", "0.9", "1.0"), class = "factor")), .Names = c("ID",
"TIME", "Y", "X1"), class = "data.frame", row.names = c(NA, -15L))
To simplify the desired output, I would like to only display the rows with non-missing Y values, such that the end product would look like this:
ID TIME Y X1
1 1 01/03/2013 08:49 5 .
2 1 01/03/2013 20:52 8 .
3 2 02/03/2013 21:14 22 .
4 2 02/16/2013 04:12 4 0.2
5 2 02/16/2013 21:02 35 .
6 3 03/02/2010 00:10 9 0.9
7 3 03/04/2010 09:00 6 0.6
Is it possible to (1) iterate across multiple rows and evaluate the absolute value of 24 hours to get the difference between the X1 and Y measurements and (2) to replace the missing values of X1 with those that are within the +/- 24 hour window?
Any thoughts on how to go about this would be greatly appreciated!
if you convert your data into xts then you can use xts's easy subsetting feature to get what you want.
PS: following code will work if you have exactly 1 value of X1 within 24 hour period of Y measurement.
require(xts)
xx <- xts(DF[, c(1, 4, 5)], as.POSIXct(paste0(DF$Date, " ", DF$TIME), format = "%m/%d/%Y %H:%M"))
sapply(index(xx[!is.na(xx$Y)]), FUN = function(tt) {
startTime <- tt - 24 * 60 * 60
endTime <- tt + 24 * 60 * 60
y <- xx[paste(startTime, endTime, sep = "/")]
if (nrow(y[!is.na(y$X1), "X1"]) != 0) {
return(as.vector(y[!is.na(y$X1), "X1"]))
} else {
return(NA)
}
})
## [1] 0.9 0.6 NA NA 1.0 0.2 NA
xx[!is.na(xx$Y), "X1"] <- sapply(index(xx[!is.na(xx$Y)]), FUN = function(tt) {
startTime <- tt - 24 * 60 * 60
endTime <- tt + 24 * 60 * 60
y <- xx[paste(startTime, endTime, sep = "/")]
if (nrow(y[!is.na(y$X1), "X1"]) != 0) {
return(as.vector(y[!is.na(y$X1), "X1"]))
} else {
return(NA)
}
})
xx[!is.na(xx$Y), "X1"]
## X1
## 2010-03-02 00:10:00 0.9
## 2010-03-04 09:00:00 0.6
## 2013-01-03 08:49:00 NA
## 2013-01-03 20:52:00 NA
## 2013-02-03 21:14:00 1.0
## 2013-02-16 04:12:00 0.2
## 2013-02-16 21:02:00 NA
this is how my data looks like:
df <- structure(list(`1` = c(1 , 2 , 3 , 4 , 5 ,6 , 7 , 8 , 9, 10 ,11 ,12, 13, 14 ,15 ,16, 17, 18), `2` = structure(c(4L,5L, 2L, 5L, 2L, 3L, 1L, 6L,4L,5L, 2L, 5L, 2L, 3L, 1L, 6L,4L,5L), .Label=c("a","a","b","b","b","c","c","b","b","b","e","e","f","g","g","g","f","f"),
class="factor"),`3`=c(1,0,1,1,0,1,0,1,1,0,1,0,1,1,0,1,0,0),`4`=c(0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,1), `5` =c(10,5,20,20,5,0,0,10,10,5,10,5,15,5,5,5,2,2)),
.Names = c("N", "Condition", "AOI_hit_b", "AOI_hit_f", "Time"), class = "data.frame",
row.names = c("1", "2", "3", "4", "5", "6", "7", "8", "9","10","11","12","13","14","15","16","17","18"))
I now want to make calculations on Time, depending on whether Condition b is preceded by a or c - depending on wheter AOI_hit_b is a 0 or 1. To that comes that only the first hit of 1 in a Condition is relevant for time, as it always writes the same time for multiple 1's in a condition.
I tried it with the ddply package, but didn't get the output I wanted.
My output should look something like this:
sum of
Condition Time
b(a) 25
b(c) 10
This is the code I have so far:
hit_cb = list()
for (i in 1:nrow(df)){
if (df[i,2] == "b") & (df[i-1,2] == "c") {
hit_cb[i] = ddply(df,.(AOI_hit_b), summarize, mysum=sum(unique(Time)))
}
}