The task is to efficiently extract events from this data:
data <- structure(
list(i = c(1, 1, 1, 2, 2, 2), t = c(1, 2, 3, 1, 3, 4), x = c(1, 1, 2, 1, 2, 3)),
.Names = c("i", "t", "x"), row.names = c(NA, -6L), class = "data.frame"
)
> data
i t x
1 1 1 1
2 1 2 1
3 1 3 2
4 2 1 1
5 2 3 2
6 2 4 3
Let's call i facts, t is time, and x is the number of selections of i at t.
An event is an uninterrupted sequence of selections of one fact. Fact 1 is selected all throughout t=1 to t=3 with a sum of 4 selections. But fact 2 is split into two events, the first from t=1 to t=1 (sum=1) and the second from t=3 to t=4 (sum=5). Therefore, the event data frame is supposed to look like this:
> event
i from to sum
1 1 1 3 4
2 2 1 1 1
3 2 3 4 5
This code does what is needed:
event <- structure(
list(i = logical(0), from = logical(0), to = logical(0), sum = logical(0)),
.Names = c("i", "from", "to", "sum"), row.names = integer(0),
class = "data.frame"
)
l <- nrow(data) # get rows of data frame
c <- 1 # set counter
d <- 1 # set initial row of data to start with
e <- 1 # set initial row of event to fill
repeat{
event[e,1] <- data[d,1] # store "i" in event data frame
event[e,2] <- data[d,2] # store "from" in event data frame
while((data[d+1,1] == data[d,1]) & (data[d+1,2] == data[d,2]+1)){
c <- c+1
d <- d+1
if(d >= l) break
}
event[e,3] <- data[d,2] # store "to" in event data frame
event[e,4] <- sum(data[(d-c+1):d,3]) # store "sum" in event data frame
c <- 1
d <- d+1
e <- e+1
}
The problem is that this code takes 3 days to extract the events from a data frame with 1 million rows and my data frame has 5 million rows.
How can I make this more efficient?
P.S.: There's also a minor bug in my code related to termination.
P.P.S.: The data is sorted first by i, then by t.
can you try if this dplyr implementation is faster?
library(dplyr)
data <- structure(
list(fact = c(1, 1, 1, 2, 2, 2), timing = c(1, 2, 3, 1, 3, 4), x = c(1, 1, 2, 1, 2, 3)),
.Names = c("fact", "timing", "x"), row.names = c(NA, -6L), class = "data.frame"
)
group_by(data, fact) %>%
mutate(fromto=cumsum(c(0, diff(timing) > 1))) %>%
group_by(fact, fromto) %>%
summarize(from=min(timing), to=max(timing), sumx=sum(x)) %>%
select(-fromto) %>%
ungroup()
how about this data.table implementation?
library(data.table)
data <- structure(
list(fact = c(1, 1, 1, 2, 2, 2), timing = c(1, 2, 3, 1, 3, 4), x = c(1, 1, 2, 1, 2, 3)),
.Names = c("fact", "timing", "x"), row.names = c(NA, -6L), class = "data.frame"
)
setDT(data)[, fromto:=cumsum(c(0, diff(timing) > 1)), by=fact]
event <- data[, .(from=min(timing), to=max(timing), sumx=sum(x)), by=c("fact", "fromto")][,fromto:=NULL]
##results when i enter event in the R console and my data.table package version is data.table_1.9.6
> event
fact from to sumx
1: 1 1 3 4
2: 2 1 1 1
3: 2 3 4 5
> str(event)
Classes ‘data.table’ and 'data.frame': 3 obs. of 4 variables:
$ fact: num 1 2 2
$ from: num 1 1 3
$ to : num 3 1 4
$ sumx: num 4 1 5
- attr(*, ".internal.selfref")=<externalptr>
> dput(event)
structure(list(fact = c(1, 2, 2), from = c(1, 1, 3), to = c(3,
1, 4), sumx = c(4, 1, 5)), row.names = c(NA, -3L), class = c("data.table",
"data.frame"), .Names = c("fact", "from", "to", "sumx"), .internal.selfref = <pointer: 0x0000000000120788>)
Reference
detect intervals of the consequent integer sequences
Assuming the data frame is sorted according to data$t, you can try something like this
event <- NULL
for (i in unique(data$i)) {
x <- data[data$i == i, ]
ev <- cumsum(c(1, diff(x$t)) > 1)
smry <- lapply(split(x, ev), function(z) c(i, range(z$t), sum(z$x)))
event <- c(event, smry)
}
event <- do.call(rbind, event)
rownames(event) <- NULL
colnames(event) <- c('i', 'from', 'to', 'sum')
The result is a matrix, not a data frame.
Related
I have these two columns in my data.frame :
df1 <- structure(list(Mode = c("car", "walk", "passenger", "car", "bus"
), Licence = c(1, 1, 0, 1, 1)), row.names = c(NA, -5L), class = "data.frame")
df1
# Mode Licence
# 1 car 1
# 2 walk 1
# 3 passenger 0
# 4 car 1
# 5 bus 1
I want to make an indicator vector b, that is 1 if the mode of that person is not car an have a driver licence and 0 otherwise. in the above example I need d to be:
df2 <- structure(list(Mode = c("car", "walk", "passenger", "car", "bus"
), Licence = c(1, 1, 0, 1, 1), b = c(0, 1, 0, 0, 1)), row.names = c(NA,
-5L), class = "data.frame")
df2
# Mode Licence b
# 1 car 1 0
# 2 walk 1 1
# 3 passenger 0 0
# 4 car 1 0
# 5 bus 1 1
Here you go. You could use "ifelse" statements for this as its easier to understand.
data = data.frame(mode = c("car", "walk", "passanger", "car", "bus"), License = c(1,1,0,1,1))
data$b = ifelse(data$mode !="car" & data$License == 1, 1,0)
Another solution using logical operations and implicit conversion between numeric and logical:
df1$b <- with(df1, Mode!="car" & Licence)*1
Note: 0 is equivalent to FALSE and everything else is equivalent to TRUE, so if the possible values are just 0 and 1, we can shorten Licence == 1 to just Licence. The *1 part at he end converts truth values to 0's and 1's again.
Another solution with dplyr:
library(dplyr)
df1 %>% mutate(b = if_else(Mode %in% c('walk', 'bus')&Licence == 1, # condition
true = 1,
false = 0))
Sample data.frame:
structure(list(a = c(1, 2, 3), b = c(4, 5, 6), c = c(7, 8, 9)), .Names = c("a", "b", "c"), row.names = c(NA, -3L), class = "data.frame")
Output:
df
# a b c
# 1 1 4 7
# 2 2 5 8
# 3 3 6 9
I'd like to get the first and third columns, but I want to subset by name and also by column index.
df[, "a"]
# [1] 1 2 3
df[, 3]
# [1] 7 8 9
df[, c("a", 3)]
# Error in `[.data.frame`(df, , c("a", 3)) : undefined columns selected
df[, c(match("a", names(df)), 3)]
# a c
# 1 1 7
# 2 2 8
# 3 3 9
Are there functions or packages that allow for clean/simple syntax, as in the third example, while also achieving the result of the fourth example?
Maybe use dplyr?
For interactive use - i.e., if you know ahead of time the name of the column you want to select
library(dplyr)
df %>% select(a, 3)
If you do not know the name of the column in advance, and want to pass it as a variable,
x <- names(df)[1]
x
[1] "a"
df %>% select_(x, 3)
Either way the output is
# a c
#1 1 7
#2 2 8
#3 3 9
In base R you can combine subset with select.
df <- structure(list(a = c(1, 2, 3),
b = c(4, 5, 6), c = c(7, 8, 9)),
.Names = c("a", "b", "c"), row.names = c(NA, -3L), class = "data.frame")
df <- subset(df, select = c(a, 3))
You can index names(df) without using dplyr:
df <- structure(list(a = c(1, 2, 3), b = c(4, 5, 6), c = c(7, 8, 9)), .Names = c("a", "b", "c"), row.names = c(NA, -3L), class = "data.frame")
df[,c("a",names(df)[3]) ]
Output:
a c
1 1 7
2 2 8
3 3 9
I have two data frames that display a month and a list of ids in each row. They look like this:
dataframe A:
Month ID
2016-03 1,2,3
2016-04 4,5,6
2016-05 7,8,9
dataframe B:
Month ID
2016-03 2,3,4
2016-04 5,6,7
2016-05 8,9,10
Seems simple, and perhaps I'm overthinking it, but I'm having trouble subtracting the corresponding rows from dataframe B from dataframe A.
Ultimate goal is to get the count of ids per row from dataframe A after dataframe B is removed.
So the resulting dataframe would look like:
Month ID
2016-03 1
2016-04 4
2016-05 7
and my count would be 1, 1, 1.
Thanks in advance for the help!
Update:
The values in the "ID" column are list objects like:
c("1", "2", "3")
Use setdiff once you have appropriate vectors for each Month:
result <- Map(setdiff, A$ID, B$ID[match(A$Month,B$Month)] ))
#[[1]]
#[1] 1
#
#[[2]]
#[1] 4
#
#[[3]]
#[1] 7
If you need the lengths you can easily do:
lengths(result)
#[1] 1 1 1
Where, the data used was:
A <- structure(list(Month = c("2016-03", "2016-04", "2016-05"), ID = list(
c(1, 2, 3), c(4, 5, 6), c(7, 8, 9))), .Names = c("Month",
"ID"), row.names = c(NA, -3L), class = "data.frame")
B <- structure(list(Month = c("2016-03", "2016-04", "2016-05"), ID = list(
c(2, 3, 4), c(5, 6, 7), c(8, 9, 10))), .Names = c("Month",
"ID"), row.names = c(NA, -3L), class = "data.frame")
Given a data frame
df=data.frame(
E=c(1,1,2,1,3,2,2),
N=c(4,4,10,4,3,2,2)
)
I would like to create a third column: Every time a value equals another value in the same column and these rows are also equal in the other column it results in a match (new character for every match).
dfx=data.frame(
E=c(1,1,2,1,3,2,2,3, 2),
N=c(4,4,10,4,3,2,2,6, 10),
matched=c("A", "A", "B","A", NA, "C", "C", NA, "B")
)
Thanks!
Here, df is:
df <- structure(list(E = c(1, 1, 2, 1, 3, 2, 2, 3, 2), N = c(4, 4,
10, 4, 3, 2, 2, 6, 10)), .Names = c("E", "N"), row.names = c(NA,
-9L), class = "data.frame")
You can do:
dfx <- transform(df, matched = {
i <- as.character(interaction(df[c("E", "N")]))
tab <- table(i)[order(unique(i))]
LETTERS[match(i, names(tab)[tab > 1])]
})
# E N matched
# 1 1 4 A
# 2 1 4 A
# 3 2 10 B
# 4 1 4 A
# 5 3 3 <NA>
# 6 2 2 C
# 7 2 2 C
# 8 3 6 <NA>
# 9 2 10 B
I have a dataframe "forum" that basically looks like this:
post-id: 1, 2, 3, 4, 5, ...
user-id: 1, 1, 2, 3, 4, ...
subforum-id: 1, 1, 1, 2, 3, ...
Now I'm trying to create a new dataframe that looks like this:
subforum-id: 1, 2, 3, ...
number-of-users-that-posted-only-once-to-this-subforum: ...
number-of-users-that-posted-more-than-n-times-to-this-subforum: ...
Is there any way to do that without pre-fabricating all the counts?
Using plyr and summarise:
# N = 1 here
ddply(DF, .(subforum.id), summarise, once = sum(table(user.id) == 1),
n.times = sum(table(user.id) > N))
# subforum.id once n.times
# 1 1 1 1
# 2 2 1 0
# 3 3 1 0
This is the data.frame DF:
DF <- structure(list(post.id = 1:5, user.id = c(1, 1, 2, 3, 4),
subforum.id = c(1, 1, 1, 2, 3)),
.Names = c("post.id", "user.id", "subforum.id"),
row.names = c(NA, -5L), class = "data.frame")
Here's a basic idea to get you started: Use table to get a count of user ids by subforum ids and work from there:
> mydf <- structure(list(post.id = c(1, 2, 3, 4, 5), user.id = c(1, 1,
2, 3, 4), subforum.id = c(1, 1, 1, 2, 3)), .Names = c("post.id",
"user.id", "subforum.id"), row.names = c(NA, -5L), class = "data.frame")
> mytable <- with(mydf, table(subforum.id, user.id))
> mytable
user.id
subforum.id 1 2 3 4
1 2 1 0 0
2 0 0 1 0
3 0 0 0 1
Hint: from there, look at the rowSums function, and think about what happens if you sum over a logical vector.