Subset R data.frame by index and name in one line - r

Sample data.frame:
# Bind the sample data to `df`, the name every example below refers to.
df <- structure(list(a = c(1, 2, 3), b = c(4, 5, 6), c = c(7, 8, 9)), .Names = c("a", "b", "c"), row.names = c(NA, -3L), class = "data.frame")
Output:
df
# a b c
# 1 1 4 7
# 2 2 5 8
# 3 3 6 9
I'd like to get the first and third columns, but I want to subset by name and also by column index.
df[, "a"]
# [1] 1 2 3
df[, 3]
# [1] 7 8 9
df[, c("a", 3)]
# Error in `[.data.frame`(df, , c("a", 3)) : undefined columns selected
df[, c(match("a", names(df)), 3)]
# a c
# 1 1 7
# 2 2 8
# 3 3 9
Are there functions or packages that allow for clean/simple syntax, as in the third example, while also achieving the result of the fourth example?

Maybe use dplyr?
For interactive use - i.e., if you know ahead of time the name of the column you want to select
library(dplyr)
df %>% select(a, 3)
If you do not know the name of the column in advance, and want to pass it as a variable,
x <- names(df)[1]
x
[1] "a"
# select_() is deprecated; all_of() selects the column whose name is stored
# in the character variable x, and 3 still selects by position.
df %>% select(all_of(x), 3)
Either way the output is
# a c
#1 1 7
#2 2 8
#3 3 9

In base R you can combine subset with select.
df <- structure(list(a = c(1, 2, 3),
b = c(4, 5, 6), c = c(7, 8, 9)),
.Names = c("a", "b", "c"), row.names = c(NA, -3L), class = "data.frame")
df <- subset(df, select = c(a, 3))

You can index names(df) without using dplyr:
df <- structure(list(a = c(1, 2, 3), b = c(4, 5, 6), c = c(7, 8, 9)), .Names = c("a", "b", "c"), row.names = c(NA, -3L), class = "data.frame")
df[,c("a",names(df)[3]) ]
Output:
a c
1 1 7
2 2 8
3 3 9

Related

How to add new column and calculate recursive cum using dplyr and shift

I have a dataset: (actually I have more than 100 groups)
and I want to use dplyr to create a variable y for each group, setting the first value of y to 1 and computing each later value from the previous row:
Second y = 1* first x + 2*first y
The result would be:
I tried to create a column- y, all=1, then use
df%>% group_by(group)%>% mutate(var=shift(x)+2*shift(y))%>% ungroup()
but then the formula for y always uses the initial y value (1) instead of the previously computed y:
Second y = 1* first x + 2*1
Could someone give me some ideas about this? Thank you!
The dput of my result data is:
structure(list(group = c("a", "a", "a", "a", "a", "b", "b", "b" ), x =
c(1, 2, 3, 4, 5, 6, 7, 8), y = c(1, 3, 8, 19, 42, 1, 8, 23)),
row.names = c(NA, -8L), class = c("tbl_df", "tbl", "data.frame" ))
To perform such calculation we can use accumulate from purrr or Reduce in base R.
Since you are already using dplyr we can use accumulate :
library(dplyr)
df %>%
group_by(group) %>%
mutate(y1 = purrr::accumulate(x[-n()], ~.x * 2 + .y, .init = 1))
# group x y y1
# <chr> <dbl> <dbl> <dbl>
#1 a 1 1 1
#2 a 2 3 3
#3 a 3 8 8
#4 a 4 19 19
#5 a 5 42 42
#6 b 6 1 1
#7 b 7 8 8
#8 b 8 23 23

Recode variable values into strings based on different variable

I have two datasets:
df1:
structure(list(v1 = c(1, 4, 3, 7, 8, 1, 2, 4)), row.names = c(NA,
-8L), class = c("tbl_df", "tbl", "data.frame"))
df2:
structure(list(val = c(1, 2, 3, 4, 5, 6, 7, 8, 9), lab = c("a",
"b", "c", "d", "e", "f", "g", "h", "i")), row.names = c(NA, -9L
), class = c("tbl_df", "tbl", "data.frame"))
I want to recode v1 in df1 according to the values (val) and labels (lab) in df2.
Following this, my output should look like this:
df3:
structure(list(v1 = c("a", "d", "c", "g", "h", "a", "b", "d")), row.names = c(NA,
-8L), class = c("tbl_df", "tbl", "data.frame"))
Is there a package or function I am missing that could easily solve this problem? The problem itself looks quite easy to me, but I found no simple solution. Of course, writing a for loop would always be possible, but it would probably be too slow and complicated, as I want to do this many times with big datasets.
An option using dplyr which will keep the original order
library(dplyr)
new_df <- df1 %>%
transmute(v1 = left_join(df1, df2, by = c("v1" = "val"))$lab)
# v1
# <chr>
#1 a
#2 d
#3 c
#4 g
#5 h
#6 a
#7 b
#8 d
identical(new_df, df3)
#[1] TRUE
Another option, in base R, is merge; note that this does not keep the original row order
df1$v1 <- merge(df1, df2, all.x = TRUE, by.x = "v1", by.y = "val")$lab
# v1
# <chr>
#1 a
#2 a
#3 b
#4 c
#5 d
#6 d
#7 g
#8 h
Below is a simple solution:
X<-as.data.frame(df1)
Y<-as.data.frame(df2)
final_df <- merge(X, Y, all.x = TRUE, by.x = "v1", by.y = "val")
print(final_df)
output
v1 lab
1 1 a
2 1 a
3 2 b
4 3 c
5 4 d
6 4 d
7 7 g
8 8 h
This will not keep the order, but the approach below, using dplyr, keeps the order as well.
library(dplyr)
X<-as.data.frame(df1)
Y<-as.data.frame(df2)
final_df <- X %>%
transmute(v1 = left_join(X, Y, by = c("v1" = "val"))$lab)
print(final_df)
output
v1
1 a
2 d
3 c
4 g
5 h
6 a
7 b
8 d
I hope this helps

Subset of dataframe for which 2 variables match another dataframe in R

I'm looking to obtain a subset of my first, larger, dataframe 'df1' by selecting rows which contain particular combinations in the first two variables, as specified in a smaller 'df2'. For example:
df1 <- data.frame(ID = c("A", "A", "A", "B", "B", "B"),
day = c(1, 2, 2, 1, 2, 3), value = seq(4,9))
df1 # my actual df has 20 variables
ID day value
A 1 4
A 2 5
A 2 6
B 1 7
B 2 8
B 3 9
df2 <- data.frame(ID = c("A", "B"), day = c(2, 1))
df2 # this df remains at 2 variables
ID day
A 2
B 1
Where the output would be:
ID day value
A 2 5
A 2 6
B 1 7
Any help would be much appreciated, thanks!
This is a good use of the merge function.
df1 <- data.frame(ID = c("A", "A", "A", "B", "B", "B"),
day = c(1, 2, 2, 1, 2, 3), value = seq(4,9))
df2 <- data.frame(ID = c("A", "B"), day = c(2, 1))
merge(df1,
df2,
by = c("ID", "day"))
Which gives output:
ID day value
1 A 2 5
2 A 2 6
3 B 1 7
Here is a dplyr solution:
library("dplyr")
semi_join(df1, df2, by = c("ID", "day"))
# ID day value
# 1 A 2 5
# 2 A 2 6
# 3 B 1 7

R: Efficient extraction of events (continuous increase in a variable)

The task is to efficiently extract events from this data:
data <- structure(
list(i = c(1, 1, 1, 2, 2, 2), t = c(1, 2, 3, 1, 3, 4), x = c(1, 1, 2, 1, 2, 3)),
.Names = c("i", "t", "x"), row.names = c(NA, -6L), class = "data.frame"
)
> data
i t x
1 1 1 1
2 1 2 1
3 1 3 2
4 2 1 1
5 2 3 2
6 2 4 3
Let's call i facts, t is time, and x is the number of selections of i at t.
An event is an uninterrupted sequence of selections of one fact. Fact 1 is selected all throughout t=1 to t=3 with a sum of 4 selections. But fact 2 is split into two events, the first from t=1 to t=1 (sum=1) and the second from t=3 to t=4 (sum=5). Therefore, the event data frame is supposed to look like this:
> event
i from to sum
1 1 1 3 4
2 2 1 1 1
3 2 3 4 5
This code does what is needed:
# Build the `event` data frame row by row: one row per uninterrupted run of
# consecutive t values within the same i, holding i, the first t ("from"),
# the last t ("to") and the sum of x over the run.
# Start from an empty data frame that already has the four event columns.
event <- structure(
list(i = logical(0), from = logical(0), to = logical(0), sum = logical(0)),
.Names = c("i", "from", "to", "sum"), row.names = integer(0),
class = "data.frame"
)
l <- nrow(data) # get rows of data frame
# c counts how many rows of data belong to the event currently being built.
c <- 1 # set counter
d <- 1 # set initial row of data to start with
e <- 1 # set initial row of event to fill
# NOTE(review): the `break` below only leaves the inner `while`; `repeat`
# itself has no exit condition, so once d runs past the last row the lookups
# return NA and the loop presumably stops with an error rather than ending
# cleanly — confirm.
repeat{
event[e,1] <- data[d,1] # store "i" in event data frame
event[e,2] <- data[d,2] # store "from" in event data frame
# Extend the event while the next row has the same i and its t is exactly
# one more than the current row's t.
while((data[d+1,1] == data[d,1]) & (data[d+1,2] == data[d,2]+1)){
c <- c+1
d <- d+1
if(d >= l) break
}
event[e,3] <- data[d,2] # store "to" in event data frame
# Rows (d-c+1):d are the c rows that make up this event.
event[e,4] <- sum(data[(d-c+1):d,3]) # store "sum" in event data frame
# Reset the row counter and move on to the next data row and event row.
c <- 1
d <- d+1
e <- e+1
}
The problem is that this code takes 3 days to extract the events from a data frame with 1 million rows and my data frame has 5 million rows.
How can I make this more efficient?
P.S.: There's also a minor bug in my code related to termination.
P.P.S.: The data is sorted first by i, then by t.
Can you check whether this dplyr implementation is faster?
library(dplyr)
data <- structure(
list(fact = c(1, 1, 1, 2, 2, 2), timing = c(1, 2, 3, 1, 3, 4), x = c(1, 1, 2, 1, 2, 3)),
.Names = c("fact", "timing", "x"), row.names = c(NA, -6L), class = "data.frame"
)
group_by(data, fact) %>%
mutate(fromto=cumsum(c(0, diff(timing) > 1))) %>%
group_by(fact, fromto) %>%
summarize(from=min(timing), to=max(timing), sumx=sum(x)) %>%
select(-fromto) %>%
ungroup()
How about this data.table implementation?
library(data.table)
data <- structure(
list(fact = c(1, 1, 1, 2, 2, 2), timing = c(1, 2, 3, 1, 3, 4), x = c(1, 1, 2, 1, 2, 3)),
.Names = c("fact", "timing", "x"), row.names = c(NA, -6L), class = "data.frame"
)
setDT(data)[, fromto:=cumsum(c(0, diff(timing) > 1)), by=fact]
event <- data[, .(from=min(timing), to=max(timing), sumx=sum(x)), by=c("fact", "fromto")][,fromto:=NULL]
##results when i enter event in the R console and my data.table package version is data.table_1.9.6
> event
fact from to sumx
1: 1 1 3 4
2: 2 1 1 1
3: 2 3 4 5
> str(event)
Classes ‘data.table’ and 'data.frame': 3 obs. of 4 variables:
$ fact: num 1 2 2
$ from: num 1 1 3
$ to : num 3 1 4
$ sumx: num 4 1 5
- attr(*, ".internal.selfref")=<externalptr>
> dput(event)
structure(list(fact = c(1, 2, 2), from = c(1, 1, 3), to = c(3,
1, 4), sumx = c(4, 1, 5)), row.names = c(NA, -3L), class = c("data.table",
"data.frame"), .Names = c("fact", "from", "to", "sumx"), .internal.selfref = <pointer: 0x0000000000120788>)
Reference
detect intervals of the consequent integer sequences
Assuming the rows for each value of data$i are sorted by data$t, you can try something like this:
# Collect one summary vector (i, from, to, sum) per uninterrupted run of t,
# fact by fact, then stack the vectors into a matrix.
runs <- NULL
for (fact in unique(data$i)) {
  rows <- data[data$i == fact, ]
  # A new run starts wherever consecutive t values differ by more than 1;
  # the cumulative count of such breaks labels each run.
  run_id <- cumsum(c(FALSE, diff(rows$t) > 1))
  per_run <- split(rows, run_id)
  runs <- c(
    runs,
    lapply(per_run, function(run) c(fact, range(run$t), sum(run$x)))
  )
}
event <- do.call(rbind, runs)
# Drop the run labels used as row names and name the four columns.
dimnames(event) <- list(NULL, c("i", "from", "to", "sum"))
The result is a matrix, not a data frame.

match rows across two columns

Given a data frame
df=data.frame(
E=c(1,1,2,1,3,2,2),
N=c(4,4,10,4,3,2,2)
)
I would like to create a third column: Every time a value equals another value in the same column and these rows are also equal in the other column it results in a match (new character for every match).
dfx=data.frame(
E=c(1,1,2,1,3,2,2,3, 2),
N=c(4,4,10,4,3,2,2,6, 10),
matched=c("A", "A", "B","A", NA, "C", "C", NA, "B")
)
Thanks!
Here, df is:
df <- structure(list(E = c(1, 1, 2, 1, 3, 2, 2, 3, 2), N = c(4, 4,
10, 4, 3, 2, 2, 6, 10)), .Names = c("E", "N"), row.names = c(NA,
-9L), class = "data.frame")
You can do:
# Label every (E, N) combination that occurs more than once with a letter,
# assigned in order of first appearance; combinations that occur only once
# get NA.
dfx <- transform(df, matched = {
  # One key per row, e.g. "1.4" for E = 1, N = 4.
  i <- as.character(interaction(df[c("E", "N")]))
  # table() sorts its names, so index by unique(i) to put the counts back in
  # order of first appearance. (order(unique(i)) is the sorting permutation,
  # not its inverse, and only works when the two happen to coincide.)
  tab <- table(i)[unique(i)]
  # LETTERS has 26 entries, so a 27th repeated combination would get NA.
  LETTERS[match(i, names(tab)[tab > 1])]
})
# E N matched
# 1 1 4 A
# 2 1 4 A
# 3 2 10 B
# 4 1 4 A
# 5 3 3 <NA>
# 6 2 2 C
# 7 2 2 C
# 8 3 6 <NA>
# 9 2 10 B

Resources