cSplit function in R [duplicate] - r

I have mydf data frame below. I want to split any cell that contains comma separated data and put it into rows. I am looking for a data frame similar to y below. How could i do it efficiently in few steps? Currently i am using cSplit function on one column at a time.
I tried cSplit(mydf, c("name","new"), ",", direction = "long"), but that didn`t work
library(splitstackshape)
mydf=data.frame(name = c("AB,BW","x,y,z"), AB = c('A','B'), new=c("1,2,3","4,5,6,7"))
mydf
x=cSplit(mydf, c("name"), ",", direction = "long")
x
y=cSplit(x, c("new"), ",", direction = "long")
y

There are times when a for loop is totally fine to work with in R. This is one of those times. Try:
library(splitstackshape)
cols <- c("name", "new")
for (i in cols) {
mydf <- cSplit(mydf, i, ",", "long")
}
mydf
## name AB new
## 1: AB A 1
## 2: AB A 2
## 3: AB A 3
## 4: BW A 1
## 5: BW A 2
## 6: BW A 3
## 7: x B 4
## 8: x B 5
## 9: x B 6
## 10: x B 7
## 11: y B 4
## 12: y B 5
## 13: y B 6
## 14: y B 7
## 15: z B 4
## 16: z B 5
## 17: z B 6
## 18: z B 7
Here's a small test using slightly bigger data:
# concat.test = sample data from "splitstackshape"
test <- do.call(rbind, replicate(5000, concat.test, FALSE))
fun1 <- function() {
cols <- c("Likes", "Siblings")
for (i in cols) {
test <- cSplit(test, i, ",", "long")
}
test
}
fun2 <- function() {
test %>%
separate_rows("Likes") %>%
separate_rows("Siblings")
}
system.time(fun1())
# user system elapsed
# 3.205 0.056 3.261
system.time(fun2())
# user system elapsed
# 11.598 0.066 11.662

We can use the separate_rows function from the tidyr package.
library(tidyr)
mydf2 <- mydf %>%
separate_rows("name") %>%
separate_rows("new")
mydf2
# AB name new
# 1 A AB 1
# 2 A AB 2
# 3 A AB 3
# 4 A BW 1
# 5 A BW 2
# 6 A BW 3
# 7 B x 4
# 8 B x 5
# 9 B x 6
# 10 B x 7
# 11 B y 4
# 12 B y 5
# 13 B y 6
# 14 B y 7
# 15 B z 4
# 16 B z 5
# 17 B z 6
# 18 B z 7
If you don't what to use separate_rows function more than once, we can further design a function to iteratively apply the separate_rows function.
expand_fun <- function(df, vars){
while (length(vars) > 0){
df <- df %>% separate_rows(vars[1])
vars <- vars[-1]
}
return(df)
}
The expand_fun takes two arguments. The first argument, df, is the original data frame. The second argument, vars, is a character string with the columns names we want to expand. Here is an example using the function.
mydf3 <- expand_fun(mydf, vars = c("name", "new"))
mydf3
# AB name new
# 1 A AB 1
# 2 A AB 2
# 3 A AB 3
# 4 A BW 1
# 5 A BW 2
# 6 A BW 3
# 7 B x 4
# 8 B x 5
# 9 B x 6
# 10 B x 7
# 11 B y 4
# 12 B y 5
# 13 B y 6
# 14 B y 7
# 15 B z 4
# 16 B z 5
# 17 B z 6
# 18 B z 7

Related

Find conditional expectation in R using sapply

I have one data frame that has 6 variables.
df is the name of my data frame.
I found the expectation of e (variable in df) using
Ee <- mean(df[["e"]])
How do I find E[e|Z=z] for z in {0,1}?
Similarly, how do I find E[e|X=x] for x in {1...20} using the sapply function?
Here's a thought:
set.seed(42)
sampdata <- data.frame(e = runif(1000), z = sample(0:1, size=1000, replace=TRUE), x = sample(1:20, size=1000, replace=TRUE))
head(sampdata)
# e z x
# 1 0.9148060 1 15
# 2 0.9370754 0 2
# 3 0.2861395 1 13
# 4 0.8304476 1 12
# 5 0.6417455 1 4
# 6 0.5190959 0 7
aggregate(e ~ z, data = sampdata, FUN = mean)
# z e
# 1 0 0.4910876
# 2 1 0.4852118
aggregate(e ~ x, data = sampdata, FUN = mean)
# x e
# 1 1 0.5097038
# 2 2 0.4495141
# 3 3 0.5077897
# 4 4 0.5300375
# 5 5 0.4549345
# 6 6 0.5122537
# 7 7 0.4704425
# 8 8 0.4911532
# 9 9 0.5572367
# 10 10 0.4634067
# 11 11 0.4408758
# 12 12 0.4815633
# 13 13 0.5503166
# 14 14 0.4922317
# 15 15 0.5205427
# 16 16 0.4999023
# 17 17 0.4784551
# 18 18 0.4282990
# 19 19 0.4202285
# 20 20 0.4852303
But if you feel you must use sapply, then this can be equivalent.
sapply(setNames(nm = unique(sampdata$z)), function(Z) mean(sampdata[["e"]][ sampdata[["z"]] == Z ]))
# 1 0
# 0.4852118 0.4910876
sapply(setNames(nm = unique(sampdata$x)), function(X) mean(sampdata[["e"]][ sampdata[["x"]] == X ]))
# 15 2 13 12 4 7 19 16 10 1
# 0.5205427 0.4495141 0.5503166 0.4815633 0.5300375 0.4704425 0.4202285 0.4999023 0.4634067 0.5097038
# 9 3 14 18 11 20 5 8 17 6
# 0.5572367 0.5077897 0.4922317 0.4282990 0.4408758 0.4852303 0.4549345 0.4911532 0.4784551 0.5122537
An option with dplyr
library(dplyr)
sampdata %>%
group_by(z) %>%
summarise(e = mean(e))
data
set.seed(42)
sampdata <- data.frame(e = runif(1000), z = sample(0:1, size=1000, replace=TRUE),
x = sample(1:20, size=1000, replace=TRUE))

Count freq of word and which column it appears in using R

I have a dataset of a series of names in different columns. Each column determines the time in which the names were entered into the system. Is it possible to find the number of times ALL the names appear and the most recent column entry. I added a picture to show how the dataset works.
Here's one method:
library(dplyr)
set.seed(42)
dat <- setNames(as.data.frame(replicate(4, sample(letters, size = 10, replace = TRUE))), 1:4)
dat
# 1 2 3 4
# 1 q x c c
# 2 e g i z
# 3 a d y a
# 4 y y d j
# 5 j e e x
# 6 d n m k
# 7 r t e o
# 8 z z t v
# 9 q r b z
# 10 o o h h
tidyverse
library(dplyr)
library(tidyr)
pivot_longer(dat, everything(), names_to = "colname", values_to = "word") %>%
mutate(colname = as.integer(colname)) %>%
group_by(word) %>%
summarize(n = n(), latest = max(colname), .groups = "drop")
# # A tibble: 20 x 3
# word n latest
# <chr> <int> <int>
# 1 a 2 4
# 2 b 1 3
# 3 c 2 4
# 4 d 3 3
# 5 e 4 3
# 6 g 1 2
# 7 h 2 4
# 8 i 1 3
# 9 j 2 4
# 10 k 1 4
# 11 m 1 3
# 12 n 1 2
# 13 o 3 4
# 14 q 2 1
# 15 r 2 2
# 16 t 2 3
# 17 v 1 4
# 18 x 2 4
# 19 y 3 3
# 20 z 4 4
data.table
library(data.table)
melt(as.data.table(dat), integer(0), variable.name = "colname", value.name = "word")[
, colname := as.integer(colname)
][, .(n = .N, latest = max(colname)), by = .(word) ]
(though it is not sorted by word, the values are the same)

ddply multifactorial all pairwise t-tests

How to perform a multifactorial t-test for all possible pairs of groups with a minimal number of coding lines.
My example:
3x features : 1,2,3
4x groups: : A,B,C,D
Aim: For each feature test all pairs of groups:
1(A-B,A-C,A-D,B-C,B-D,C-D)
2(A-B,A-C,A-D,B-C,B-D,C-D)
3(A-B,A-C,A-D,B-C,B-D,C-D)
= 18 T-tests
At the moment I am using ddply and inside lapply :
library(plyr)
groupVector <- c(rep("A",10),rep("B",10),rep("C",10),rep("D",10))
featureVector <- rep(1:3,each=40)
mydata <- data.frame(feature=factorVector,group=groupVector,value=rnorm(120,0,1))
ddply(mydata,.(feature),function(x){
grid <- combn(unique(x$group),2, simplify = FALSE)
df <- lapply(grid,function(p){
sub <- subset(x,group %in% p)
pval <- t.test(sub$value ~ sub$group)$p.value
data.frame(groupA=p[1],groupB=p[2],pval=pval)
})
res <- do.call("rbind",df)
return(res)
})
Here's my take, although it's arguable whether it's 'better'
split.data <- split(mydata, mydata$feature)
pairs <- as.data.frame(matrix(combn(unique(mydata$group), 2), nrow=2))
library(tidyverse)
map_df(split.data, function(x) map_df(pairs, function(y) tibble(groupA = y[1], groupB = y[2],
pval = t.test(value ~ group, data = x, subset = which(x$group %in% y))$p.value)), .id="feature")
Output
# # A tibble: 18 x 4
# feature groupA groupB pval
# <chr> <chr> <chr> <dbl>
# 1 1 A B 0.28452419
# 2 1 A C 0.65114472
# 3 1 A D 0.77746420
# 4 1 B C 0.42546791
# 5 1 B D 0.39876582
# 6 1 C D 0.88079645
# 7 2 A B 0.57843592
# 8 2 A C 0.30726571
# 9 2 A D 0.55457986
# 10 2 B C 0.74871464
# 11 2 B D 0.24017130
# 12 2 C D 0.04252878
# 13 3 A B 0.01355117
# 14 3 A C 0.08746756
# 15 3 A D 0.24527519
# 16 3 B C 0.15130684
# 17 3 B D 0.09172577
# 18 3 C D 0.64206517

How to merge and sum two data frames

Here is my issue:
df1 <- data.frame(x = 1:5, y = 2:6, z = 3:7)
rownames(df1) <- LETTERS[1:5]
df1
x y z
A 1 2 3
B 2 3 4
C 3 4 5
D 4 5 6
E 5 6 7
df2 <- data.frame(x = 1:5, y = 2:6, z = 3:7)
rownames(df2) <- LETTERS[3:7]
df2
x y z
C 1 2 3
D 2 3 4
E 3 4 5
F 4 5 6
G 5 6 7
what I wanted is:
x y z
A 1 2 3
B 2 3 4
C 4 6 8
D 6 8 10
E 8 10 12
F 4 5 6
G 5 6 7
where duplicated rows were added up by same variable.
A solution with base R:
# create a new variable from the rownames
df1$rn <- rownames(df1)
df2$rn <- rownames(df2)
# bind the two dataframes together by row and aggregate
res <- aggregate(cbind(x,y,z) ~ rn, rbind(df1,df2), sum)
# or (thx to #alistaire for reminding me):
res <- aggregate(. ~ rn, rbind(df1,df2), sum)
# assign the rownames again
rownames(res) <- res$rn
# get rid of the 'rn' column
res <- res[, -1]
which gives:
> res
x y z
A 1 2 3
B 2 3 4
C 4 6 8
D 6 8 10
E 8 10 12
F 4 5 6
G 5 6 7
With dplyr,
library(dplyr)
# add rownames as a column in each data.frame and bind rows
bind_rows(df1 %>% add_rownames(),
df2 %>% add_rownames()) %>%
# evaluate following calls for each value in the rowname column
group_by(rowname) %>%
# add all non-grouping variables
summarise_all(sum)
## # A tibble: 7 x 4
## rowname x y z
## <chr> <int> <int> <int>
## 1 A 1 2 3
## 2 B 2 3 4
## 3 C 4 6 8
## 4 D 6 8 10
## 5 E 8 10 12
## 6 F 4 5 6
## 7 G 5 6 7
could also vectorize the operation turning the dfs to matrices:
result_df <- as.data.frame(as.matrix(df1) + as.matrix(df2))
This might need some teaking to get the rownames logic working on a longer example:
dfr <-rbind(df1,df2)
do.call(rbind, lapply( split(dfr, sapply(rownames(dfr),substr,1,1)), colSums))
x y z
A 1 2 3
B 2 3 4
C 4 6 8
D 6 8 10
E 8 10 12
F 4 5 6
G 5 6 7
If the rownames could all be assumed to be alpha characters a gsub solution should be easy.
An alternative is to melt the data and cast it. At first we set the row names to the last column of both data frames thanks to #Jaap
df1$rn <- rownames(df1)
df2$rn <- rownames(df2)
Then we melt the data based on the name
melt(list(df1, df2), id.vars = "rn")
Then we use dcast with mget function which is used to retrieve multiple variables at once.
mydf<- dcast(melt(mget(ls(pattern = "df\\d+")), id.vars = "rn"),
rn ~ variable, value.var = "value", fun.aggregate = sum)
rownames(mydf) <- mydf$rn
# get rid of the 'rn' column
mydf <- mydf[, -1]
> mydf
# x y z
#A 1 2 3
#B 2 3 4
#C 4 6 8
#D 6 8 10
#E 8 10 12
#F 4 5 6
#G 5 6 7

In R, split a dataframe so subset dataframes contain last row of previous dataframe and first row of subsequent dataframe

There are many answers for how to split a dataframe, for example How to split a data frame?
However, I'd like to split a dataframe so that the smaller dataframes contain the last row of the previous dataframe and the first row of the following dataframe.
Here's an example
n <- 1:9
group <- rep(c("a","b","c"), each = 3)
data.frame(n = n, group)
n group
1 1 a
2 2 a
3 3 a
4 4 b
5 5 b
6 6 b
7 7 c
8 8 c
9 9 c
I'd like the output to look like:
d1 <- data.frame(n = 1:4, group = c(rep("a",3),"b"))
d2 <- data.frame(n = 3:7, group = c("a",rep("b",3),"c"))
d3 <- data.frame(n = 6:9, group = c("b",rep("c",3)))
d <- list(d1, d2, d3)
d
[[1]]
n group
1 1 a
2 2 a
3 3 a
4 4 b
[[2]]
n group
1 3 a
2 4 b
3 5 b
4 6 b
5 7 c
[[3]]
n group
1 6 b
2 7 c
3 8 c
4 9 c
What is an efficient way to accomplish this task?
Suppose DF is the original data.frame, the one with columns n and group. Let n be the number of rows in DF. Now define a function extract which given a sequence of indexes ix enlarges it to include the one prior to the first and after the last and then returns those rows of DF. Now that we have defined extract, split the vector 1, ..., n by group and apply extract to each component of the split.
n <- nrow(DF)
extract <- function(ix) DF[seq(max(1, min(ix) - 1), min(n, max(ix) + 1)), ]
lapply(split(seq_len(n), DF$group), extract)
$a
n group
1 1 a
2 2 a
3 3 a
4 4 b
$b
n group
3 3 a
4 4 b
5 5 b
6 6 b
7 7 c
$c
n group
6 6 b
7 7 c
8 8 c
9 9 c
Or why not try good'ol by, which "[a]ppl[ies] a Function to a Data Frame Split by Factors [INDICES]".
by(data = df, INDICES = df$group, function(x){
id <- c(min(x$n) - 1, x$n, max(x$n) + 1)
na.omit(df[id, ])
})
# df$group: a
# n group
# 1 1 a
# 2 2 a
# 3 3 a
# 4 4 b
# --------------------------------------------------------------------------------
# df$group: b
# n group
# 3 3 a
# 4 4 b
# 5 5 b
# 6 6 b
# 7 7 c
# --------------------------------------------------------------------------------
# df$group: c
# n group
# 6 6 b
# 7 7 c
# 8 8 c
# 9 9 c
Although the print method of by creates a 'fancy' output, the (default) result is a list, with elements named by the levels of the grouping variable (just try str and names on the resulting object).
I was going to comment under #cdetermans answer but its too late now.
You can generalize his approach using data.table::shift (or dyplr::lag) in order to find the group indices and then run a simple lapply on the ranges, something like
library(data.table) # v1.9.6+
indx <- setDT(df)[, which(group != shift(group, fill = TRUE))]
lapply(Map(`:`, c(1L, indx - 1L), c(indx, nrow(df))), function(x) df[x,])
# [[1]]
# n group
# 1: 1 a
# 2: 2 a
# 3: 3 a
# 4: 4 b
#
# [[2]]
# n group
# 1: 3 a
# 2: 4 b
# 3: 5 b
# 4: 6 b
# 5: 7 c
#
# [[3]]
# n group
# 1: 6 b
# 2: 7 c
# 3: 8 c
# 4: 9 c
Could be done with data.frame as well, but is there ever a reason not to use data.table? Also this has the option to be executed with parallelism.
library(data.table)
n <- 1:9
group <- rep(c("a","b","c"), each = 3)
df <- data.table(n = n, group)
df[, `:=` (group = factor(df$group))]
df[, `:=` (group_i = seq_len(.N), group_N = .N), by = "group"]
library(doParallel)
groups <- unique(df$group)
foreach(i = seq(groups)) %do% {
df[group == groups[i] | (as.integer(group) == i + 1 & group_i == 1) | (as.integer(group) == i - 1 & group_i == group_N), c("n", "group"), with = FALSE]
}
[[1]]
n group
1: 1 a
2: 2 a
3: 3 a
4: 4 b
[[2]]
n group
1: 3 a
2: 4 b
3: 5 b
4: 6 b
5: 7 c
[[3]]
n group
1: 6 b
2: 7 c
3: 8 c
4: 9 c
Here is another dplyr way:
library(dplyr)
data =
data_frame(n = n, group) %>%
group_by(group)
firsts =
data %>%
slice(1) %>%
ungroup %>%
mutate(new_group = lag(group)) %>%
slice(-1)
lasts =
data %>%
slice(n()) %>%
ungroup %>%
mutate(new_group = lead(group)) %>%
slice(-n())
bind_rows(firsts, data, lasts) %>%
mutate(final_group =
ifelse(is.na(new_group),
group,
new_group) ) %>%
arrange(final_group, n) %>%
group_by(final_group)

Resources