R: which rows in a dataframe exist in another dataframe?

I have two dataframes A and B:
A
x y
1 0.0 0.0000000
2 0.5 0.8000000
3 -0.5 0.8000000
4 -1.0 0.0000000
5 -0.5 -0.8000000
6 0.5 -0.8000000
7 1.0 0.0000000
8 1.5 0.8000000
B
x y
1 -1.0 0.0000000
2 0.5 -0.8000000
3 3.0 0.0000000
I want to extract just the indices of the rows in A that also exist in B, so that the final result will be:
c(4,6)
How should I go about doing this?

interaction() can be used to apply %in% across multiple columns: it collapses each row into a single factor key, so whole rows can be compared at once.
which(interaction(A) %in% interaction(B))
#[1] 4 6
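The same idea, collapsing each row into a single key before comparing, can also be sketched with paste(), assuming no two distinct rows produce the same concatenated string:
which(paste(A$x, A$y) %in% paste(B$x, B$y))
# [1] 4 6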
Data
A <- read.table(header=TRUE, text=" x y
1 0.0 0.0000000
2 0.5 0.8000000
3 -0.5 0.8000000
4 -1.0 0.0000000
5 -0.5 -0.8000000
6 0.5 -0.8000000
7 1.0 0.0000000
8 1.5 0.8000000")
B <- read.table(header=TRUE, text=" x y
1 -1.0 0.0000000
2 0.5 -0.8000000
3 3.0 0.0000000")

Add another column to A that is just a row sequence, then merge:
> A$c=1:nrow(A)
> merge(A,B)$c
[1] 4 6
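Note that merge() sorts its result by the join columns by default, so if the indices are needed in ascending order it may be safer to sort them explicitly:
sort(merge(A, B)$c)
# [1] 4 6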

Using the join.keys function from plyr:
library(plyr)
with(join.keys(A, B), which(x %in% y))
Output:
[1] 4 6

One possible way is to test each column with %in% and keep the rows where the match count equals the number of columns (here, 2). If there are more columns, you can map over them. Note that this checks each column independently, so in general a row could be flagged even if its x and y values match different rows of B.
which(`+`(A$x %in% B$x, A$y %in% B$y) == 2)
# [1] 4 6
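A whole-row comparison avoids that caveat. A minimal sketch using duplicated(), assuming A itself contains no duplicated rows:
# stack B on top of A; rows of A flagged as duplicates match some row of B exactly
which(duplicated(rbind(B, A))[-seq_len(nrow(B))])
# [1] 4 6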

Using outer
which(rowSums(outer(1:nrow(A), 1:nrow(B), Vectorize(\(i, j) all(A[i, ] == B[j, ])))) == 1)
# [1] 4 6
or a for loop
r <- c()
for (j in seq_len(nrow(B))) {
  for (i in seq_len(nrow(A))) {
    if (all(A[i, ] == B[j, ])) r <- c(r, i)
  }
}
r
# [1] 4 6

library(data.table)
setDT(A)
setDT(B)
A[fintersect(A, B), on = names(A), which = TRUE]
# [1] 4 6
library(dplyr)
A |>
  mutate(row = row_number()) |>
  inner_join(B, by = names(A)) |>
  pull(row)
# [1] 4 6
Data
A = data.frame(
  x = c(0, 0.5, -0.5, -1, -0.5, 0.5, 1, 1.5),
  y = c(0, 0.8, 0.8, 0, -0.8, -0.8, 0, 0.8))
B = data.frame(
  x = c(-1, 0.5, 3),
  y = c(0, -0.8, 0))

Related

Rounding of significant figures if less than 1 in R

I would like to create a function in R that rounds numeric dataframes (or columns in a dataframe) depending on the number. If the number is less than 1, round to 1 decimal, but if it is greater than 1, round to 0 decimals.
This is what I have
data <- data.frame(x = c(1.111, 0.809, 5.55555, 0.567), y = c(0.235, 0.777, 4.55555555, 393.55))
round0 <- function(x) format(round(x, digits=0), nsmall = 0, trim = TRUE)
round0(data)
x y
1 1 0
2 1 1
3 6 5
4 1 394
# What I want
x y
1 1 0.2
2 0.8 1
3 6 5
4 1 394
> round0 <- function(x) ifelse(x<1,round(x,1),round(x))
> sapply(data,round0)
x y
[1,] 1.0 0.2
[2,] 0.8 0.8
[3,] 6.0 5.0
[4,] 0.6 394.0
You can use:
round0 <- function(x) ifelse(x < 1, format(round(x, 1), nsmall = 1), round(x))
data[] <- lapply(data, round0)
data
# x y
#1 1 0.2
#2 0.8 0.8
#3 6 5
#4 0.6 394
Note that this is for display purposes only: the resulting columns are of type character. If you want to perform any mathematical calculation on them, you need to convert them back to numeric.
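If numeric columns are needed downstream, one minimal sketch of that conversion (assuming the formatted strings are plain numbers) is:
data[] <- lapply(data, as.numeric)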

A function to manipulate data.frames in R

I'm trying to come up with a function that does the following to a data.frame, outputting a new data.frame with the same names:
1- Creates a seq(min(target), max(target), .1).
2- Takes the mean of all other variables.
For example, if q is our data.frame, and jen is the target in it, I want to reformat q such that jen's data becomes seq(min(jen), max(jen), .1), and both bob and joe just change to their mean values.
Is it possible to do this in R?
I tried something but it is far from being accurate.
q = data.frame(bob = 1:5 - 3, jen = c(1.7, 2.6, 2.5, 4.4, 3.8) - 3, joe = 5:9)
change <- function(dataframe = q, target = "jen"){
  n <- names(dataframe)
  dataframe[target] <- seq(from = min(target), max(target), .1)
}
A base R solution. My idea is to create the target column first in the function, and then use a for loop to add the means of the other columns.
# Example data frame
q <- data.frame(bob = 1:5 - 3, jen = c(1.7, 2.6, 2.5, 4.4, 3.8) - 3, joe = 5:9)
# Create then function
change <- function(dat, target){
  vec <- dat[, target]
  target_new <- seq(min(vec), max(vec), by = 0.1)
  dat2 <- data.frame(target_new)
  names(dat2) <- target
  for (i in names(dat)[!names(dat) %in% target]){
    dat2[[i]] <- mean(dat[[i]])
  }
  dat2 <- dat2[, names(dat)]
  return(dat2)
}
# Apply the function
change(q, "jen")
# bob jen joe
# 1 0 -1.3 7
# 2 0 -1.2 7
# 3 0 -1.1 7
# 4 0 -1.0 7
# 5 0 -0.9 7
# 6 0 -0.8 7
# 7 0 -0.7 7
# 8 0 -0.6 7
# 9 0 -0.5 7
# 10 0 -0.4 7
# 11 0 -0.3 7
# 12 0 -0.2 7
# 13 0 -0.1 7
# 14 0 0.0 7
# 15 0 0.1 7
# 16 0 0.2 7
# 17 0 0.3 7
# 18 0 0.4 7
# 19 0 0.5 7
# 20 0 0.6 7
# 21 0 0.7 7
# 22 0 0.8 7
# 23 0 0.9 7
# 24 0 1.0 7
# 25 0 1.1 7
# 26 0 1.2 7
# 27 0 1.3 7
# 28 0 1.4 7
Here is one option with base R
data.frame(Map(function(x, y) if (x == "mean") get(x)(y) else
                 get(x)(min(y), max(y), by = 0.1),
               setNames(c("mean", "seq", "mean"), names(q)), q))
Or with dplyr
library(dplyr)
q %>%
  summarise(bob = mean(bob),
            jen = list(seq(min(jen), max(jen), by = 0.1)),
            joe = mean(joe)) %>%
  unnest
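In current tidyr (>= 1.0) unnest() expects the list-column to be named explicitly; a sketch of the same pipeline under that assumption:
q %>%
  summarise(bob = mean(bob),
            jen = list(seq(min(jen), max(jen), by = 0.1)),
            joe = mean(joe)) %>%
  tidyr::unnest(cols = jen)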
Or, if there are many columns to take the mean of and only a single sequence column, then instead of specifying them one by one:
q %>%
  mutate_at(c(1, 3), mean) %>%
  group_by(bob, joe) %>%
  summarise(jen = list(seq(min(jen), max(jen), by = 0.1))) %>%
  unnest
Or use complete
q %>%
  group_by(bob = mean(bob), joe = mean(joe)) %>%
  complete(jen = seq(min(jen), max(jen), by = .1))
My solution uses the colMeans function and repeats the result for as many rows as the sequence is long. Then I replace the target column with the sequence values.
q = data.frame(bob = 1:5 - 3, jen = c(1.7, 2.6, 2.5, 4.4, 3.8) - 3, joe = 5:9)
manip <- function(target, df){
  t.column <- which(colnames(df) == target)
  dfmeans <- colMeans(df)
  minmax <- range(df[, t.column], na.rm = TRUE)
  t.seq <- seq(minmax[1], minmax[2], .1)
  newdf <- matrix(dfmeans, ncol = length(dfmeans))[rep(1, length(t.seq)), ]
  newdf[, t.column] <- t.seq
  colnames(newdf) <- colnames(df)
  return(as.data.frame(newdf))
}
manip("jen", q)

R: Is it possible to use mutate+lag with the same column?

I'm trying to replicate the following formula in R:
X_t = X_{t-1} * b + Z_t * (1 - b)
I'm using the following code
t %>%
  mutate(x = ifelse(week == 1, z, NaN)) %>% # Initial value for the first lag
  mutate(x = ifelse(week != 1, lag(x, 1, default = 0) * b + z, z))
But I get all NaN except for the second element.
z b x
(dbl) (dbl) (dbl)
1 168.895 0.9 168.8950
2 20.304 0.9 131.7472
3 14.943 0.9 NA
4 11.028 0.9 NA
5 8.295 0.9 NA
6 8.024 0.9 NA
7 6.872 0.9 NA
8 7.035 0.9 NA
9 4.399 0.9 NA
10 4.158 0.9 NA
This is fairly simple in Excel, but I must do it in R. Do you have any approaches?
Reproducible example:
set.seed(2)
t = data.frame(week = seq(1:52),
               z = runif(52, 0, 100),
               b = 0.2)
I found the solution by running the following loop, thanks to @Frank and @docendo discimus:
for (row in 2:dim(t)[1]) {
  t[row, ] <- mutate(t[1:row, ], x = lag(x, 1) * b + z * (1 - b))[row, ]
}
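The same recursion can also be computed without an explicit loop via Reduce() with accumulate = TRUE; a minimal sketch, assuming b is constant across rows as in the reproducible example:
b <- t$b[1]
# x[1] starts at z[1]; every later x[t] is x[t-1] * b + z[t] * (1 - b)
t$x <- Reduce(function(prev, z) prev * b + z * (1 - b),
              t$z[-1], init = t$z[1], accumulate = TRUE)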

R help: divide values by sum produced through factor

I am trying to divide each value in columns B and C by the group sum given by the factor in column A.
The starting matrix could look something like this, but has thousands of rows; A is the factor, and B and C contain the values:
A <- c(1,1,2,2)
B <- c(0.2, 0.3, 1, 0.5)
C <- c(0.7, 0.5, 0, 0.9)
M <- data.table(A,B,C)
> M
A B C
[1,] 1 0.2 0.7
[2,] 1 0.3 0.5
[3,] 2 1.0 0.0
[4,] 2 0.5 0.9
The factors can occur any number of times.
I was able to produce the sum per factor level using the data.table package:
library(data.table)
M.dt <- data.table(M)
M.sum <- M.dt[, lapply(.SD, sum), by = A]
> M.sum
A B C
1: 1 0.5 1.2
2: 2 1.5 0.9
but didn't know how to go on from here to keep the original format of the table.
The resulting table should look like this:
B.1 <- c(0.4, 0.6, 0.666, 0.333)
C.1 <- c(0.583, 0.416, 0, 1)
M.1 <- cbind(A, B.1, C.1)
> M.1
A B.1 C.1
[1,] 1 0.400 0.58333
[2,] 1 0.600 0.41666
[3,] 2 0.666 0.00000
[4,] 2 0.333 1.00000
The calculation for the first value in B.1 would go like this:
0.2/(0.2+0.3) = 0.4 and so on, where the values to add are given by the factor in A.
I have some basic knowledge of R, but despite trying hard, I do badly with matrix manipulations and loops.
Simply divide each value in each column by its sum within each group defined by A:
M[, lapply(.SD, function(x) x/sum(x)), A]
# A B C
# 1: 1 0.4000000 0.5833333
# 2: 1 0.6000000 0.4166667
# 3: 2 0.6666667 0.0000000
# 4: 2 0.3333333 1.0000000
If you want to update by reference, do
M[, c("B", "C") := lapply(.SD, function(x) x/sum(x)), A]
Or more generally
M[, names(M)[-1] := lapply(.SD, function(x) x/sum(x)), A]
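A slightly more explicit variant of the same by-reference update, assuming every non-grouping column should be rescaled, uses .SDcols:
cols <- setdiff(names(M), "A")
M[, (cols) := lapply(.SD, function(x) x / sum(x)), by = A, .SDcols = cols]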
A bonus solution for the dplyr junkies
library(dplyr)
M %>%
  group_by(A) %>%
  mutate_each(funs(. / sum(.)))
# Source: local data table [4 x 3]
# Groups: A
#
# A B C
# 1 1 0.4000000 0.5833333
# 2 1 0.6000000 0.4166667
# 3 2 0.6666667 0.0000000
# 4 2 0.3333333 1.0000000
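mutate_each() and funs() have since been deprecated; assuming dplyr >= 1.0, a sketch of the equivalent with across():
M %>%
  group_by(A) %>%
  mutate(across(everything(), ~ .x / sum(.x))) %>%
  ungroup()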
Like most problems of this type, you can use the data.table or plyr packages, or some combination of split/apply/combine functions in base R.
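For the base R route, a sketch using ave() to divide within each level of A (working on a plain data.frame copy, called M2 here, so M is left untouched):
M2 <- data.frame(A, B, C)
M2[c("B", "C")] <- lapply(M2[c("B", "C")],
                          function(x) ave(x, M2$A, FUN = function(v) v / sum(v)))
M2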
For those who prefer the plyr package
library(plyr)
M <- data.table(A, B, C)
ddply(M, .(A), colwise(function(x) x / sum(x)))
Output is:
A B C
1 1 0.4000000 0.5833333
2 1 0.6000000 0.4166667
3 2 0.6666667 0.0000000
4 2 0.3333333 1.0000000

Better way to produce data frame using table()

Recently, I have found that I am using the following pattern over and over again. The process is:
cross-tabulate numeric variable by factor using table
create data frame from created table
add original numeric values to data frame (from row names (!))
remove row names
reorder columns of aggregated data frame
In R, it looks like this:
# Sample data
df <- data.frame(x = round(runif(100), 1),
                 y = factor(ifelse(runif(100) > .5, 1, 0),
                            labels = c('failure', 'success')))
# Get frequencies
dfSummary <- as.data.frame.matrix(table(df$x, df$y))
# Add column of original values from rownames
dfSummary$x <- as.numeric(rownames(dfSummary))
# Remove rownames
rownames(dfSummary) <- NULL
# Reorder columns
dfSummary <- dfSummary[, c(3, 1, 2)]
Is there anything more elegant in R, preferably using base functions? I know I can use SQL to do this in a single command; I think it has to be possible to achieve similar behavior in R.
sqldf solution:
library(sqldf)
dfSummary <- sqldf("select
                      x,
                      sum(y = 'failure') as failure,
                      sum(y = 'success') as success
                    from df group by x")
An alternative with base R could be:
aggregate(. ~ x, transform(df, success = y == "success",
                           failure = y == "failure", y = NULL), sum)
# x success failure
#1 0.0 2 4
#2 0.1 6 8
#3 0.2 1 7
#4 0.3 5 4
#5 0.4 6 6
#6 0.5 3 3
#7 0.6 4 6
#8 0.7 6 6
#9 0.8 4 5
#10 0.9 6 7
#11 1.0 1 0
Your code, modified into a function, would be efficient compared to the other base R solutions (so far). If you want it as a one-liner, a reshape/table combination from base R could be used:
reshape(as.data.frame(table(df)), idvar = 'x', timevar = 'y',
        direction = 'wide')
# x Freq.failure Freq.success
#1 0 3 2
#2 0.1 3 9
#3 0.2 5 5
#4 0.3 8 7
#5 0.4 5 3
#6 0.5 9 4
#7 0.6 3 6
#8 0.7 7 6
#9 0.8 3 1
#10 0.9 4 3
#11 1 0 4
In case you want to try data.table
library(data.table)
dcast.data.table(setDT(df), x~y)
# x failure success
# 1: 0.0 3 2
# 2: 0.1 3 9
# 3: 0.2 5 5
# 4: 0.3 8 7
# 5: 0.4 5 3
# 6: 0.5 9 4
# 7: 0.6 3 6
# 8: 0.7 7 6
# 9: 0.8 3 1
#10: 0.9 4 3
#11: 1.0 0 4
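In newer data.table versions the same reshape can go through dcast() directly; with no value.var it falls back to counting rows per cell, which is what builds the frequency table. A sketch with the aggregation spelled out, assuming data.table >= 1.9.6:
data.table::dcast(setDT(df), x ~ y, value.var = "y", fun.aggregate = length)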
Update
I didn't notice that as.data.frame(table(...)) converts the columns to factors (thanks to @Hadley's comment). A workaround is:
res <- transform(reshape(as.data.frame(table(df), stringsAsFactors = FALSE),
                         idvar = 'x', timevar = 'y', direction = 'wide'),
                 x = as.numeric(x))
data
set.seed(24)
df <- data.frame(x = round(runif(100), 1),
                 y = factor(ifelse(runif(100) > .5, 1, 0),
                            labels = c('failure', 'success')))
Benchmarks
set.seed(24)
df <- data.frame(x = round(runif(1e6), 1),
                 y = factor(ifelse(runif(1e6) > .5, 1, 0),
                            labels = c('failure', 'success')))
tomas <- function() {
  dfSummary <- as.data.frame.matrix(table(df$x, df$y))
  dfSummary$x <- as.numeric(rownames(dfSummary))
  dfSummary <- dfSummary[, c(3, 1, 2)]
}
doc <- function() {
  aggregate(. ~ x, transform(df, success = y == "success",
                             failure = y == "failure", y = NULL), sum)
}
akrun <- function() {
  reshape(as.data.frame(table(df)), idvar = 'x', timevar = 'y',
          direction = 'wide')
}
library(microbenchmark)
microbenchmark(tomas(), doc(), akrun(), unit='relative', times=20L)
Unit: relative
#expr min lq mean median uq max neval cld
#tomas() 1.000000 1.0000000 1.000000 1.000000 1.0000000 1.000000 20 a
#doc() 13.451037 11.5050997 13.082074 13.043584 12.8048306 19.715535 20 b
#akrun() 1.019977 0.9522809 1.012332 1.007569 0.9993835 1.533191 20 a
Updated with dcast.data.table
df1 <- copy(df)
akrun2 <- function() {dcast.data.table(setDT(df1), x~y)}
microbenchmark(tomas(), akrun2(), unit='relative', times=20L)
# Unit: relative
# expr min lq mean median uq max neval cld
# tomas() 6.493231 6.345752 6.410853 6.51594 6.502044 5.591753 20 b
# akrun2() 1.000000 1.000000 1.000000 1.00000 1.000000 1.000000 20 a
This should be relatively efficient. You cannot really suppress row names in a data frame, since they are a requirement of a valid data frame:
X <- table(df$x, df$y)
cbind(data.frame(x = rownames(X)), unclass(X))
x failure success
0 0 5 3
0.1 0.1 6 1
0.2 0.2 7 8
0.3 0.3 7 3
0.4 0.4 6 6
0.5 0.5 6 4
0.6 0.6 2 5
0.7 0.7 2 7
0.8 0.8 3 7
0.9 0.9 4 6
1 1 2 0
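As written, x comes back as a character column (it is built from the row names). A minimal sketch that also restores a numeric x and default row names, under the same setup:
X <- table(df$x, df$y)
data.frame(x = as.numeric(rownames(X)), unclass(X), row.names = NULL)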
