Related
need help in N number or column wise subtraction and division, Below are the columns in a input dataframe.
input dataframe:
> df
A B C D
1 1 3 6 2
2 3 3 3 4
3 1 2 2 2
4 4 4 4 4
5 5 2 3 2
formula - a, (b - a) / (1-a)
MY CODE
ABC <- cbind.data.frame(DF[1], (DF[-1] - DF[-ncol(DF)])/(1 - DF[-ncol(DF)]))
Expected out:
A B C D
1 Inf -1.5 0.8
3 0.00 0.0 -0.5
1 Inf 0.0 0.0
4 0.00 0.0 0.0
5 0.75 -1.0 0.5
But i dont want to use ncol here, cause there is a last column after column D in the actual dataframe.
So want to apply this formula only till first 4 column, IF i use ncol, it will traverse till last column in the dataframe.
Please help thanks.
What about trying:
df <- matrix(c(1,3,6,2,3,3,3,4,1,2,2,2,4,4,4,4,5,2,3,2), nrow = 5, byrow = TRUE)
df_2 <- matrix((df[,2]-df[,1])/(1-df[,1]),5,1)
df_3 <- matrix((df[,3]-df[,2])/(1-df[,2]),5,1)
df_4 <- matrix((df[,4]-df[,3])/(1-df[,3]),5,1)
cbind(df[,1],df_2,df_3,df_4)
edit: a loop version
df <- matrix(c(1,3,6,2,3,3,3,4,1,2,2,2,4,4,4,4,5,2,3,2), nrow = 5, byrow = TRUE)
test_bind <- c()
test_bind <- cbind(test_bind, df[,1])
for (i in 1:3){df_1 <- matrix((df[,i+1]-df[,i])/(1-df[,i]),5,1)
test_bind <- cbind(test_bind,df_1)}
test_bind
here is one option with tidyverse
library(dplyr)
library(purrr)
map2_df(DF[2:4], DF[1:3], ~ (.x - .y)/(1- .y)) %>%
bind_cols(DF[1], .)
# A B C D
#1 1 Inf -1.5 0.8
#2 3 0.00 0.0 -0.5
#3 1 Inf 0.0 0.0
#4 4 0.00 0.0 0.0
#5 5 0.75 -1.0 0.5
I have come to an issue that filtering or slicing the top and bottom n number of rows at the same time from the grouped data.
So it is different than this Select first and last row from grouped data
What I need to do that if sub_gr==a then filter|slice top three rows
if sub_gr==b then filter|slice bottom two rows that's it!
my data something like this
df <- data.frame(gr=rep(seq(1,2),each=10),sub_gr=rep(rep(c("a","b"),each=5),2),
y = rep(c(sort(runif(5,0,0.5),decreasing=TRUE), sort(runif(5,0,0.5),,decreasing=TRUE)),2),
x = rep(c(seq(0.1,0.5,0.1),rev(seq(-0.5,-0.1,0.1))),2))
gr sub_gr y x
1 1 a 0.37851909 0.1
2 1 a 0.33305165 0.2
3 1 a 0.22478005 0.3
4 1 a 0.09677654 0.4
5 1 a 0.07060651 0.5
6 1 b 0.41999445 -0.1
7 1 b 0.35356301 -0.2
8 1 b 0.33274398 -0.3
9 1 b 0.20451400 -0.4
10 1 b 0.03714828 -0.5
11 2 a 0.37851909 0.1
12 2 a 0.33305165 0.2
13 2 a 0.22478005 0.3
14 2 a 0.09677654 0.4
15 2 a 0.07060651 0.5
16 2 b 0.41999445 -0.1
17 2 b 0.35356301 -0.2
18 2 b 0.33274398 -0.3
19 2 b 0.20451400 -0.4
20 2 b 0.03714828 -0.5
library(dplyr)
Here is what I tried,
df%>%
group_by(gr, sub_gr)%>%
slice(if(any(sub_gr=="a")) {row_number()==1:3} else {row_number()==4:n()})
Warning messages:
1: In 1:5 == 1:3 :
longer object length is not a multiple of shorter object length
2: In 1:5 == 4:5L :
longer object length is not a multiple of shorter object length
3: In 1:5 == 1:3 :
longer object length is not a multiple of shorter object length
4: In 1:5 == 4:5L :
longer object length is not a multiple of shorter object length
thanks for your help in advance!
There are probably more elegant solutions, but I think the following works. I set seed for reproducibility.
set.seed(123)
df <- data.frame(gr=rep(seq(1,2),each=10),sub_gr=rep(rep(c("a","b"),each=5),2),
y = rep(c(sort(runif(5,0,0.5),decreasing=TRUE), sort(runif(5,0,0.5),,decreasing=TRUE)),2),
x = rep(c(seq(0.1,0.5,0.1),rev(seq(-0.5,-0.1,0.1))),2))
df %>%
group_by(gr, sub_gr) %>%
filter((sub_gr %in% "a" & row_number() %in% 1:3) |
(sub_gr %in% "b" & row_number() %in% (n() - 1):n())) %>%
ungroup()
# # A tibble: 10 x 4
# gr sub_gr y x
# <int> <fctr> <dbl> <dbl>
# 1 1 a 0.47023364 0.1
# 2 1 a 0.44150870 0.2
# 3 1 a 0.39415257 0.3
# 4 1 b 0.22830737 -0.4
# 5 1 b 0.02277825 -0.5
# 6 2 a 0.47023364 0.1
# 7 2 a 0.44150870 0.2
# 8 2 a 0.39415257 0.3
# 9 2 b 0.22830737 -0.4
# 10 2 b 0.02277825 -0.5
library(tidyverse)
# create a custom function to take the head or tail based on your rule
cond_slice <- function(x) {
if (unique(x$sub_gr) == "a") {
head(x, 3)
} else {
tail(x, 2)
}
}
# create a column to split by and then map across the subsets
result <- x %>%
unite(split_by, gr, sub_gr, remove = F) %>%
split(.$split_by) %>%
map(cond_slice) %>%
bind_rows() %>%
select(-split_by)
I have a dataframe df with columns ID, X and Y
ID = c(1,1,2,2)
X = c(1,0.4,0.8,0.1)
Y = c(0.5,0.5,0.7,0.7)
df <- data.frame(ID,X,Y)
ID X Y
1 1.0 0.5
1 0.4 0.5
2 0.8 0.7
2 0.1 0.7
I would like to obtain two new columns:
Xg equal to X when X is greater than Y and NA otherwise
Xl equal to X when X is less than Y and NA otherwise. That is,
ID X Y Xg Xl
1 1.0 0.5 1.0 NA
1 0.4 0.5 NA 0.4
2 0.8 0.7 0.8 NA
2 0.1 0.7 NA 0.1
Below should work, even if there are NA's in X or Y:
library(dplyr)
df %>%
mutate(Xg = ifelse(X > Y, X, NA),
Xl = ifelse(X < Y, Y, NA))
If you want to use if_else from dplyr, you have to convert NA to numeric. if_else is stricter than ifelse in that it checks whether the TRUE and FALSE values are the same type:
df %>%
mutate(Xg = if_else(X > Y, X, as.numeric(NA)),
Xl = if_else(X < Y, Y, as.numeric(NA)))
Result:
ID X Y Xg Xl
1 1 1.0 0.5 1.0 NA
2 1 0.4 0.5 NA 0.5
3 2 0.8 0.7 0.8 NA
4 2 0.1 0.7 NA 0.7
5 3 NA 1.0 NA NA
6 3 3.0 NA NA NA
Data:
ID = c(1,1,2,2,3,3)
X = c(1,0.4,0.8,0.1,NA,3)
Y = c(0.5,0.5,0.7,0.7,1,NA)
df <- data.frame(ID,X,Y)
What about some plain old R indexing and subsetting?
ID <- c(1,1,2,2, 3, 3)
X <- c(1,0.4,0.8,0.1, NA, 2)
Y <- c(0.5,0.5,0.7,0.7, 2, NA)
Xg <- Xl <- rep(NA_real_, length(ID))
Xg[which(X > Y)] <- X[which(X > Y)]
Xl[which(X < Y)] <- X[which(X < Y)]
data.frame(ID, X, Y, Xg, Xl)
Note: I assume that if X or Y is missing, Xg and Xl should be NA.
For the sake of completeness and as the question originally used data.table() before it was edited (and because I like the concise code) here is "one-liner" using data.table's update in place:
library(data.table)
setDT(df)[X > Y, Xg := X][X < Y, Xl := X][]
ID X Y Xg Xl
1: 1 1.0 0.5 1.0 NA
2: 1 0.4 0.5 NA 0.4
3: 2 0.8 0.7 0.8 NA
4: 2 0.1 0.7 NA 0.1
5: 3 NA 1.0 NA NA
6: 3 3.0 NA NA NA
(Using the data of useR)
NA's are handled automatically as only matching rows are updated.
Recently, I have found that I am using the following pattern over and over again. The process is:
cross-tabulate numeric variable by factor using table
create data frame from created table
add original numeric values to data frame (from row names (!))
remove row names
reorder columns of aggregated data frame
In R, it looks like this:
# Sample data
df <- data.frame(x = round(runif(100), 1),
y = factor(ifelse(runif(100) > .5, 1, 0),
labels = c('failure', 'success'))
)
# Get frequencies
dfSummary <- as.data.frame.matrix(table(df$x, df$y))
# Add column of original values from rownames
dfSummary$x <- as.numeric(rownames(dfSummary))
# Remove rownames
rownames(dfSummary) <- NULL
# Reorder columns
dfSummary <- dfSummary[, c(3, 1, 2)]
Is there anything more elegant in R, preferably using base functions? I know I can use sql to do this in single command - I think that it has to be possible to achieve similar behavior in R.
sqldf solution:
library(sqldf)
dfSummary <- sqldf("select
x,
sum(y = 'failure') as failure,
sum(y = 'success') as success
from df group by x")
An alternative with base R could be:
aggregate(. ~ x, transform(df, success = y == "sucess",
failure = y == "failure", y = NULL), sum)
# x success failure
#1 0.0 2 4
#2 0.1 6 8
#3 0.2 1 7
#4 0.3 5 4
#5 0.4 6 6
#6 0.5 3 3
#7 0.6 4 6
#8 0.7 6 6
#9 0.8 4 5
#10 0.9 6 7
#11 1.0 1 0
Your code modified as a function would be efficient compared to the other solutions in base R (so far). If you wanted the code in one-line, a "reshape/table" combo from base R could be used.
reshape(as.data.frame(table(df)), idvar='x', timevar='y',
direction='wide')
# x Freq.failure Freq.success
#1 0 3 2
#2 0.1 3 9
#3 0.2 5 5
#4 0.3 8 7
#5 0.4 5 3
#6 0.5 9 4
#7 0.6 3 6
#8 0.7 7 6
#9 0.8 3 1
#10 0.9 4 3
#11 1 0 4
In case you want to try data.table
library(data.table)
dcast.data.table(setDT(df), x~y)
# x failure success
# 1: 0.0 3 2
# 2: 0.1 3 9
# 3: 0.2 5 5
# 4: 0.3 8 7
# 5: 0.4 5 3
# 6: 0.5 9 4
# 7: 0.6 3 6
# 8: 0.7 7 6
# 9: 0.8 3 1
#10: 0.9 4 3
#11: 1.0 0 4
Update
I didn't notice the as.data.frame(table( converts to "factor" columns (thanks to #Hadley's comment). A workaround is:
res <- transform(reshape(as.data.frame(table(df), stringsAsFactors=FALSE),
idvar='x', timevar='y', direction='wide'), x= as.numeric(x))
data
set.seed(24)
df <- data.frame(x = round(runif(100), 1),
y = factor(ifelse(runif(100) > .5, 1, 0),
labels = c('failure', 'success'))
)
Benchmarks
set.seed(24)
df <- data.frame(x = round(runif(1e6), 1),
y = factor(ifelse(runif(1e6) > .5, 1, 0),
labels = c('failure', 'success'))
)
tomas <- function(){
dfSummary <- as.data.frame.matrix(table(df$x, df$y))
dfSummary$x <- as.numeric(rownames(dfSummary))
dfSummary <- dfSummary[, c(3, 1, 2)]}
doc <- function(){aggregate(. ~ x, transform(df,
success = y == "success", failure = y == "failure",
y = NULL), sum)}
akrun <- function(){reshape(as.data.frame(table(df)),
idvar='x', timevar='y', direction='wide')}
library(microbenchmark)
microbenchmark(tomas(), doc(), akrun(), unit='relative', times=20L)
Unit: relative
#expr min lq mean median uq max neval cld
#tomas() 1.000000 1.0000000 1.000000 1.000000 1.0000000 1.000000 20 a
#doc() 13.451037 11.5050997 13.082074 13.043584 12.8048306 19.715535 20 b
#akrun() 1.019977 0.9522809 1.012332 1.007569 0.9993835 1.533191 20 a
Updated with dcast.data.table
df1 <- copy(df)
akrun2 <- function() {dcast.data.table(setDT(df1), x~y)}
microbenchmark(tomas(), akrun2(), unit='relative', times=20L)
# Unit: relative
# expr min lq mean median uq max neval cld
# tomas() 6.493231 6.345752 6.410853 6.51594 6.502044 5.591753 20 b
# akrun2() 1.000000 1.000000 1.000000 1.00000 1.000000 1.000000 20 a
This should be relatively efficient. You cannot really suppress rownames in a dataframe, since they are a requirement of a valid dataframe
X <- table(df$x,df$y)
cbind( data.frame(x=rownames(X)), unclass(X) )
x failure success
0 0 5 3
0.1 0.1 6 1
0.2 0.2 7 8
0.3 0.3 7 3
0.4 0.4 6 6
0.5 0.5 6 4
0.6 0.6 2 5
0.7 0.7 2 7
0.8 0.8 3 7
0.9 0.9 4 6
1 1 2 0
I have a matrix:
mat<-matrix(NA, ncol=7,nrow=9)
mat[,1]<-c(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9)
mat[,2]<-c(2,4,5,6,7,7,7,8,9)
mat[,3]<-c(2,48,63,72,81,100,100,100,100)
mat[,4]<-c(1,2,3,3,4,4,5,5,6)
mat[,5]<-c(1,2,6,7,8,8,9,10,10)
mat[,6]<-c(1,1,1,2,3,3,4,4,4)
mat[,7]<-c(1,1,1,3,4,4,4,5,5)
colnames(mat)<-c("facet","A1","A2","B1","B2","C1","C2")
facet A1 A2 B1 B2 C1 C2
[1,] 0.1 2 2 1 1 1 1
[2,] 0.2 4 48 2 2 1 1
[3,] 0.3 5 63 3 6 1 1
[4,] 0.4 6 72 3 7 2 3
[5,] 0.5 7 81 4 8 3 4
[6,] 0.6 7 100 4 8 3 4
[7,] 0.7 7 100 5 9 4 4
[8,] 0.8 8 100 5 10 4 5
[9,] 0.9 9 100 6 10 4 5
I would like to create the following plot:
Create 9 separate plots faceted by "facet".
Each plot should contain the following:
on the same position on the x axis plot A1 and A2 using points, i.e. (X=1, y=A1) and (X=1,y=A2)
on the same position on the x axis plot B1 and B2 using points, i.e. (X=2, y=B1) and (X=2,y=B2)
on the same position on the x axis plot C1 and C2 using points, i.e. (X=3, y=C1) and (X=3,y=C2)
How can this be done? I understand how to do faceting but I'm struggling with plotting the two values in the same position on the x axis and repeating for each A,B and C. can someone help?
First, reshape your matrix to a data frame in the long format:
library(reshape2)
dat <- melt(as.data.frame(mat), id.vars = "facet")
> head(dat)
# facet variable value
# 1 0.1 A1 2
# 2 0.2 A1 4
# 3 0.3 A1 5
# 4 0.4 A1 6
# 5 0.5 A1 7
# 6 0.6 A1 7
Then, create two variables based on the information in the column variable:
dat2 <- transform(dat, fac = substr(variable, 2, 2),
variable = substr(variable, 1, 1))
> head(dat2)
# facet variable value fac
# 1 0.1 A 2 1
# 2 0.2 A 4 1
# 3 0.3 A 5 1
# 4 0.4 A 6 1
# 5 0.5 A 7 1
# 6 0.6 A 7 1
Plot:
library(ggplot2)
ggplot(dat2, aes(x = variable, y = value)) +
geom_point(aes(colour = fac)) +
facet_wrap( ~ facet)
a <- cbind(mat[, 1], mat[, 2], 1, 1)
b <- cbind(mat[, 1], mat[, 3], 1, 2)
c <- cbind(mat[, 1], mat[, 4], 2, 1)
d <- cbind(mat[, 1], mat[, 5], 2, 2)
e <- cbind(mat[, 1], mat[, 6], 3, 1)
f <- cbind(mat[, 1], mat[, 7], 3, 2)
data <- as.data.frame(rbind(a, b, c, d, e, f))
colnames(data) <- c("facet", "value", "type", "time")
data$type <- factor(data$type, labels = c("A", "B", "C"))
ggplot(data, aes(y = value, x = type, fill = factor(time))) +
geom_point(aes(color = factor(time)),
position = position_jitter(w = 0.1, h = 0.0))+
facet_wrap(~facet)