Find conditional expectation in R using sapply - r

I have one data frame that has 6 variables.
df is the name of my data frame.
I found the expectation of e (variable in df) using
Ee <- mean(df[["e"]])
How do I find E[e|Z=z] for z in {0,1}?
Similarly, how do I find E[e|X=x] for x in {1...20} using the sapply function?

Here's a thought:
set.seed(42)
sampdata <- data.frame(e = runif(1000), z = sample(0:1, size=1000, replace=TRUE), x = sample(1:20, size=1000, replace=TRUE))
head(sampdata)
# e z x
# 1 0.9148060 1 15
# 2 0.9370754 0 2
# 3 0.2861395 1 13
# 4 0.8304476 1 12
# 5 0.6417455 1 4
# 6 0.5190959 0 7
aggregate(e ~ z, data = sampdata, FUN = mean)
# z e
# 1 0 0.4910876
# 2 1 0.4852118
aggregate(e ~ x, data = sampdata, FUN = mean)
# x e
# 1 1 0.5097038
# 2 2 0.4495141
# 3 3 0.5077897
# 4 4 0.5300375
# 5 5 0.4549345
# 6 6 0.5122537
# 7 7 0.4704425
# 8 8 0.4911532
# 9 9 0.5572367
# 10 10 0.4634067
# 11 11 0.4408758
# 12 12 0.4815633
# 13 13 0.5503166
# 14 14 0.4922317
# 15 15 0.5205427
# 16 16 0.4999023
# 17 17 0.4784551
# 18 18 0.4282990
# 19 19 0.4202285
# 20 20 0.4852303
But if you feel you must use sapply, then this can be equivalent.
sapply(setNames(nm = unique(sampdata$z)), function(Z) mean(sampdata[["e"]][ sampdata[["z"]] == Z ]))
# 1 0
# 0.4852118 0.4910876
sapply(setNames(nm = unique(sampdata$x)), function(X) mean(sampdata[["e"]][ sampdata[["x"]] == X ]))
# 15 2 13 12 4 7 19 16 10 1
# 0.5205427 0.4495141 0.5503166 0.4815633 0.5300375 0.4704425 0.4202285 0.4999023 0.4634067 0.5097038
# 9 3 14 18 11 20 5 8 17 6
# 0.5572367 0.5077897 0.4922317 0.4282990 0.4408758 0.4852303 0.4549345 0.4911532 0.4784551 0.5122537

An option with dplyr
library(dplyr)
sampdata %>%
group_by(z) %>%
summarise(e = mean(e))
data
set.seed(42)
sampdata <- data.frame(e = runif(1000), z = sample(0:1, size=1000, replace=TRUE),
x = sample(1:20, size=1000, replace=TRUE))

Related

How to expand rows and fill in the numbers between given start and end

I have this data frame:
df <- tibble(x = c(1, 10))
x
<dbl>
1 1
2 10
I want this:
x
<int>
1 1
2 2
3 3
4 4
5 5
6 6
7 7
8 8
9 9
10 10
Unfortunately I can't remember how I have to approach. I tried expand.grid, uncount, runner::fill_run.
Update: The real world data ist like this with groups and given start and end number. Here are only two groups:
df <- tibble(group = c("A", "A", "B", "B"),
x = c(10,30, 1, 10))
group x
<chr> <dbl>
1 A 10
2 A 30
3 B 1
4 B 10
We may need full_seq with either summarise or reframe or tidyr::complete
library(dplyr)
df %>%
group_by(group) %>%
reframe(x = full_seq(x, period = 1))
# or with
#tidyr::complete(x = full_seq(x, period = 1))
-output
# A tibble: 31 × 2
group x
<chr> <dbl>
1 A 10
2 A 11
3 A 12
4 A 13
5 A 14
6 A 15
7 A 16
8 A 17
9 A 18
10 A 19
# … with 21 more rows
A simple base R variation:
group <- c(rep("A", 21), rep("B ", 10))
x <- c(10:30, 1:10)
df <- tibble(group, x)
df
# A tibble: 31 × 2
group x
<chr> <int>
1 A 10
2 A 11
3 A 12
4 A 13
5 A 14
6 A 15
And here's an expand.grid solution:
g1 <- expand.grid(group = "A", x = 20:30)
g2 <- expand.grid(group = "B", x = 1:10)
df <- rbind(g1, g2)
df
group x
1 A 20
2 A 21
3 A 22
4 A 23
5 A 24
6 A 25
7 A 26
Using base:
stack(sapply(split(df$x, df$group), function(i) seq(i[ 1 ], i[ 2 ])))

How to sample across a dataset with two factors in it?

I have a dataframe with two species A and B and certain variables a b associated with the total of 100 rows.
I want to create a sampler such that in one set it randomly picks 6 rows reps from the df dataset. However, the samples for A must only come from rows associated with sp A from df, similarly from B. I want do this for 500 times over for each of species A and B.
I attempted a for loop and when I ran sampling it shows a single row with 6 columns. I would appreciate any guidance
a <- rnorm(100, 2,1)
b <- rnorm(100, 2,1)
sp <- rep(c("A","B"), each = 50)
df <- data.frame(a,b,sp)
df.sample <- for(i in 1:1000){
sampling <- sample(df[i,],6,replace = TRUE)
}
#Output in a single row
a a.1 sp b sp.1 a.2
1000 1.68951 1.68951 B 1.395995 B 1.68951
#Expected dataframe
df.sample
set rep a b sp
1 1 1 9 A
1 2 3 2 A
1 3 0 2 A
1 4 1 2 A
1 5 1 6 A
1 6 4 2 A
2 1 1 2 B
2 2 5 2 B
2 3 1 2 B
2 4 1 6 B
2 5 1 8 B
2 6 9 2 B
....
Here's how I would do it (using tidyverse):
data:
a <- rnorm(100, 2,1)
b <- rnorm(100, 2,1)
sp <- rep(c("A","B"), each = 50)
df <- data.frame(a,b,sp)
# create an empty table with desired columns
library(tidyverse)
output <- tibble(a = numeric(),
b = numeric(),
sp = character(),
set = numeric())
# sampling in a loop
set.seed(42)
for(i in 1:500){
samp1 <- df %>% filter(sp == 'A') %>% sample_n(6, replace = TRUE) %>% mutate(set = i)
samp2 <- df %>% filter(sp == 'B') %>% sample_n(6, replace = TRUE) %>% mutate(set = i)
output %>% add_row(bind_rows(samp1, samp2)) -> output
}
Result
> head(output, 20)
# A tibble: 20 × 4
a b sp set
<dbl> <dbl> <chr> <dbl>
1 2.59 3.31 A 1
2 1.84 1.66 A 1
3 2.35 1.17 A 1
4 2.33 1.95 A 1
5 0.418 1.11 A 1
6 1.19 2.54 A 1
7 2.35 0.899 B 1
8 1.19 1.63 B 1
9 0.901 0.986 B 1
10 3.12 1.75 B 1
11 2.28 2.61 B 1
12 1.37 3.47 B 1
13 2.33 1.95 A 2
14 1.84 1.66 A 2
15 3.76 1.26 A 2
16 2.96 3.10 A 2
17 1.03 1.81 A 2
18 1.42 2.00 A 2
19 0.901 0.986 B 2
20 2.37 1.39 B 2
You could split df by species at first. Random rows in each species can be drawn by x[sample(nrow(x), 6), ]. Pass it into replicate(), you could do sampling for many times. Here dplyr::bind_rows() is used to combine samples and add a new column set indicating the sampling indices.
lapply(split(df, df$sp), function(x) {
dplyr::bind_rows(
replicate(3, x[sample(nrow(x), 6), ], FALSE),
.id = "set"
)
})
Output
$A
set a b sp
1 1 1.52480034 3.41257975 A
2 1 1.82542370 2.08511584 A
3 1 1.80019901 1.39279162 A
4 1 2.20765154 2.11879412 A
5 1 1.61295185 2.04035172 A
6 1 1.92936567 2.90362816 A
7 2 0.88903679 2.46948106 A
8 2 3.19223788 2.81329767 A
9 2 1.28629416 2.69275525 A
10 2 2.61044815 0.82495427 A
11 2 2.30928735 1.67421328 A
12 2 -0.09789704 2.62434719 A
13 3 2.10386603 1.78157862 A
14 3 2.17542841 0.84016203 A
15 3 3.22202227 3.49863423 A
16 3 1.07929909 -0.02032945 A
17 3 2.95271838 2.34460193 A
18 3 1.90414536 1.54089645 A
$B
set a b sp
1 1 3.5130317 -0.4704879 B
2 1 3.0053072 1.6021795 B
3 1 4.1167657 1.1123342 B
4 1 1.5460589 3.2915979 B
5 1 0.8742753 0.9132530 B
6 1 2.0882660 1.5588471 B
7 2 1.2444645 1.8199525 B
8 2 2.7960117 2.6657735 B
9 2 2.5970774 0.9984187 B
10 2 1.1977317 3.7360884 B
11 2 2.2830643 1.0452440 B
12 2 3.1047150 1.5609482 B
13 3 2.9309124 1.5679255 B
14 3 0.8631965 1.3501631 B
15 3 1.5460589 3.2915979 B
16 3 2.7960117 2.6657735 B
17 3 3.1047150 1.5609482 B
18 3 2.8735390 0.6329279 B
If I understood well what you want, it could be done following this code
# Create the initial data frame
a <- rnorm(100, 2,1)
b <- rnorm(100, 2,1)
sp <- rep(c("A","B"), each = 50)
df <- data.frame(a,b,sp)
# Rows with sp=A
row.A <- which(df$sp=="A")
row.B <- which(df$sp=="B")
# Sampling data.frame
sampling <- data.frame(matrix(ncol = 5, nrow = 0))
# "rep" column for each iteration
rep1 <- rep(1:6,2)
# Build the dara.frame
for(i in 1:500){
# Sampling row.A
s.A <- sample(row.A,6,replace = T)
# Sampling row.B
s.B <- sample(row.B,6,replace = T)
# Data frame with the subset of df and "set" and "rep" values
sampling <- rbind(sampling, set=cbind(rep(i,12),rep=rep1,df[c(s.A,s.B),]))
}
# Delete row.names of sampling and redefine sampling's column names
row.names(sampling) <- NULL
colnames(sampling) <- c("set", "rep", "a", "b", "sp")
And the output looks like this:
set rep a b sp
1 1 3.713663 2.717456 A
1 2 2.456070 2.803443 A
1 3 2.166655 1.395556 A
1 4 1.453738 5.662969 A
1 5 2.692518 2.971156 A
1 6 2.699634 3.016791 A

Count freq of word and which column it appears in using R

I have a dataset of a series of names in different columns. Each column determines the time in which the names were entered into the system. Is it possible to find the number of times ALL the names appear and the most recent column entry. I added a picture to show how the dataset works.
Here's one method:
library(dplyr)
set.seed(42)
dat <- setNames(as.data.frame(replicate(4, sample(letters, size = 10, replace = TRUE))), 1:4)
dat
# 1 2 3 4
# 1 q x c c
# 2 e g i z
# 3 a d y a
# 4 y y d j
# 5 j e e x
# 6 d n m k
# 7 r t e o
# 8 z z t v
# 9 q r b z
# 10 o o h h
tidyverse
library(dplyr)
library(tidyr)
pivot_longer(dat, everything(), names_to = "colname", values_to = "word") %>%
mutate(colname = as.integer(colname)) %>%
group_by(word) %>%
summarize(n = n(), latest = max(colname), .groups = "drop")
# # A tibble: 20 x 3
# word n latest
# <chr> <int> <int>
# 1 a 2 4
# 2 b 1 3
# 3 c 2 4
# 4 d 3 3
# 5 e 4 3
# 6 g 1 2
# 7 h 2 4
# 8 i 1 3
# 9 j 2 4
# 10 k 1 4
# 11 m 1 3
# 12 n 1 2
# 13 o 3 4
# 14 q 2 1
# 15 r 2 2
# 16 t 2 3
# 17 v 1 4
# 18 x 2 4
# 19 y 3 3
# 20 z 4 4
data.table
library(data.table)
melt(as.data.table(dat), integer(0), variable.name = "colname", value.name = "word")[
, colname := as.integer(colname)
][, .(n = .N, latest = max(colname)), by = .(word) ]
(though it is not sorted by word, the values are the same)

cSplit function in R [duplicate]

I have mydf data frame below. I want to split any cell that contains comma separated data and put it into rows. I am looking for a data frame similar to y below. How could i do it efficiently in few steps? Currently i am using cSplit function on one column at a time.
I tried cSplit(mydf, c("name","new"), ",", direction = "long"), but that didn`t work
library(splitstackshape)
mydf=data.frame(name = c("AB,BW","x,y,z"), AB = c('A','B'), new=c("1,2,3","4,5,6,7"))
mydf
x=cSplit(mydf, c("name"), ",", direction = "long")
x
y=cSplit(x, c("new"), ",", direction = "long")
y
There are times when a for loop is totally fine to work with in R. This is one of those times. Try:
library(splitstackshape)
cols <- c("name", "new")
for (i in cols) {
mydf <- cSplit(mydf, i, ",", "long")
}
mydf
## name AB new
## 1: AB A 1
## 2: AB A 2
## 3: AB A 3
## 4: BW A 1
## 5: BW A 2
## 6: BW A 3
## 7: x B 4
## 8: x B 5
## 9: x B 6
## 10: x B 7
## 11: y B 4
## 12: y B 5
## 13: y B 6
## 14: y B 7
## 15: z B 4
## 16: z B 5
## 17: z B 6
## 18: z B 7
Here's a small test using slightly bigger data:
# concat.test = sample data from "splitstackshape"
test <- do.call(rbind, replicate(5000, concat.test, FALSE))
fun1 <- function() {
cols <- c("Likes", "Siblings")
for (i in cols) {
test <- cSplit(test, i, ",", "long")
}
test
}
fun2 <- function() {
test %>%
separate_rows("Likes") %>%
separate_rows("Siblings")
}
system.time(fun1())
# user system elapsed
# 3.205 0.056 3.261
system.time(fun2())
# user system elapsed
# 11.598 0.066 11.662
We can use the separate_rows function from the tidyr package.
library(tidyr)
mydf2 <- mydf %>%
separate_rows("name") %>%
separate_rows("new")
mydf2
# AB name new
# 1 A AB 1
# 2 A AB 2
# 3 A AB 3
# 4 A BW 1
# 5 A BW 2
# 6 A BW 3
# 7 B x 4
# 8 B x 5
# 9 B x 6
# 10 B x 7
# 11 B y 4
# 12 B y 5
# 13 B y 6
# 14 B y 7
# 15 B z 4
# 16 B z 5
# 17 B z 6
# 18 B z 7
If you don't what to use separate_rows function more than once, we can further design a function to iteratively apply the separate_rows function.
expand_fun <- function(df, vars){
while (length(vars) > 0){
df <- df %>% separate_rows(vars[1])
vars <- vars[-1]
}
return(df)
}
The expand_fun takes two arguments. The first argument, df, is the original data frame. The second argument, vars, is a character string with the columns names we want to expand. Here is an example using the function.
mydf3 <- expand_fun(mydf, vars = c("name", "new"))
mydf3
# AB name new
# 1 A AB 1
# 2 A AB 2
# 3 A AB 3
# 4 A BW 1
# 5 A BW 2
# 6 A BW 3
# 7 B x 4
# 8 B x 5
# 9 B x 6
# 10 B x 7
# 11 B y 4
# 12 B y 5
# 13 B y 6
# 14 B y 7
# 15 B z 4
# 16 B z 5
# 17 B z 6
# 18 B z 7

Automatically rowwise a function for dataframe usage

I have a function**:
do_thing <- function(x) {
return(x + runif(1, 0, 100))
}
That I'd like to apply to my data:
df <- tibble(x = 1:10)
Preferably with mutate:
set.seed(1)
df %>%
mutate(y = do_thing(x))
The function, however, is not performing as expected:
# x y
# 1 1 27.55087
# 2 2 28.55087
# 3 3 29.55087
# 4 4 30.55087
# 5 5 31.55087
# 6 6 32.55087
# 7 7 33.55087
# 8 8 34.55087
# 9 9 35.55087
# 10 10 36.55087
I actually want the function to apply in a rowwise fashion:
df %>%
rowwise() %>%
mutate(y = do_thing(x))
# x y
# 1 1 38.21239
# 2 2 59.28534
# 3 3 93.82078
# 4 4 24.16819
# 5 5 94.83897
# 6 6 100.46753
# 7 7 73.07978
# 8 8 70.91140
# 9 9 15.17863
# 10 10 30.59746
Is there a way that I might be able to rewrite my function so that it is flexible and can automatically default to rowwise while still working with a single input (ie., do_thing(100))?
** actual function is a lot more complex
Instead of getting the runif for 1 observation, we can specify the n as the number of rows (n()) of the dataset
set.seed(24)
df %>%
mutate(y = x + runif(n(), 0, 100))
# A tibble: 10 x 2
# x y
# <int> <dbl>
# 1 1 46.952549
# 2 2 61.939816
# 3 3 94.972191
# 4 4 102.282408
# 5 5 8.780258
# 6 6 63.793740
# 7 7 80.331417
# 8 8 32.874240
# 9 9 39.073652
#10 10 83.346670

Resources