I have a dataset of this shape.
group a1 a2 ... a9 b1 b2 ... b7
1 1 0 ... 1 0 1 ... 1
1 1 1 ... 1 0 0 ... 1
1 0 0 ... 0 1 0 ... 1
1 1 1 ... 0 1 1 ... 0
2 1 0 ... 1 0 1 ... 1
2 1 1 ... 1 0 0 ... 1
2 0 0 ... 0 1 0 ... 1
2 1 1 ... 0 1 1 ... 0
...
and what I'd like to do is apply a two-argument summary function to all pairs of columns, maintaining the grouped nature of the data.
So, for example
# Two-argument summary: the mean of each input plus the share of positions
# at which both inputs are non-zero.
f <- function(a, b) {
  both_set <- a & b
  mean(a) + mean(b) + mean(both_set)
}
would return something like (I'm not actually going to compute the value of the function, I'll just put "x" to indicate where the stat would go, but of course it would be different for each group-a-b combination).
group a_col b_col stat
1 a1 b1 x
1 a1 b2 x
1 a1 b3 x
...
1 a9 b7 x
2 a1 b1 x
...
A commenter asked for some sample data. Here it is:
structure(list(group = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L,
7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 9L, 10L, 10L), a1 = c(0L,
1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L,
1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L,
1L, 0L, 0L, 0L), a2 = c(0L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 1L, 1L,
0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L,
0L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 1L), a3 = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L,
1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 1L, 1L,
0L, 0L), a4 = c(0L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 0L, 1L,
1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L), a5 = c(1L, 0L, 0L, 0L, 0L,
0L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L,
0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 0L
), b1 = c(1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 0L,
0L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L,
0L, 1L, 0L, 1L, 1L, 0L, 0L, 0L), b2 = c(0L, 0L, 1L, 0L, 0L, 0L,
1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 1L,
1L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 0L),
b3 = c(0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L,
1L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 1L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), class = "data.frame", row.names = c(NA,
-37L))
A solution using tidyverse. We can gather the columns based on the starting letters twice and then conduct the operation. Assuming your data is called dat, dat2 is the final output.
library(tidyverse)

# Reshape both column families to long form so that every row carries one
# (a-column, b-column) pairing of values, then summarise each pairing within
# each group.
dat2 <- dat %>%
  gather(column_a, value_a, starts_with("a")) %>%
  gather(column_b, value_b, starts_with("b")) %>%
  group_by(group, column_a, column_b) %>%
  # Same statistic as f(a, b) in the question: the third term is the share of
  # rows in which both values are 1, hence `&` rather than `+`.
  summarise(
    stat = mean(value_a) + mean(value_b) + mean(value_a & value_b)
  ) %>%
  ungroup()
dat2
# # A tibble: 150 x 4
# group column_a column_b stat
# <int> <chr> <chr> <dbl>
# 1 1 a1 b1 2
# 2 1 a1 b2 1.25
# 3 1 a1 b3 1.25
# 4 1 a2 b1 1
# 5 1 a2 b2 0.5
# 6 1 a2 b3 0.5
# 7 1 a3 b1 2.5
# 8 1 a3 b2 1.5
# 9 1 a3 b3 1.5
# 10 1 a4 b1 1.25
# # ... with 140 more rows
Related
I have a data frame of 1401 observations of 16 variables. For each column (except the first one), I have either 1 (if a condition is met) or 0 (if a condition is not met). Overall, the idea is to count how many observations meet certain conditions successively. We can think about it as a decision tree: in the first branch you can have either 1 (condition is met) or 0 (condition is not met), in the second branch starting from the 0 of the first branch, you can also have 1 or 0, etc... In my data frame, branches are columns. I want to investigate the impact of looking at the different branches (columns) in various orders.
My idea is to count the number of "1" in column Cn if I know that there was a "0" in column Cn-1.
dput(droplevels(head(data,20)))
structure(list(Substance = structure(c(13L, 9L, 10L, 12L, 1L,
19L, 16L, 17L, 5L, 2L, 14L, 7L, 4L, 6L, 20L, 18L, 15L, 3L, 11L,
8L), .Label = c("104653-34-1", "107-02-8", "111-30-8", "12057-74-8",
"122454-29-9", "14915-37-8", "20859-73-8", "27083-27-8", "28772-56-7",
"3691-35-8", "55965-84-9", "56073-07-5", "56073-10-0", "5836-29-3",
"71751-41-2", "74-90-8", "81-81-2", "86347-14-0", "90035-08-8",
"91465-08-6"), class = "factor"), colA = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
colB = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L), colC = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), colD = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L), colE = c(0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L,
1L, 1L), colF = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L), colG = c(0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L,
1L), colH = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), colI = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L
), colK = c(1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 0L,
0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L), colJ = c(0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 1L,
0L, 0L), colL = c(1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L,
0L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 0L, 1L), colM = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_), colN = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), colO = c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L)), .Names = c("Substance",
"Oral", "Dermal", "Inhalation", "SC", "SED", "RS", "SS", "M",
"C", "R", "STOT.SE", "STOT.RE", "AT", "Eco.Acute", "Eco.Chronic"
), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 9L, 10L, 12L, 13L,
14L, 17L, 18L, 19L, 20L, 21L, 22L, 28L, 34L), class = "data.frame")
#I define the order in which I look at the columns
orderA <- colnames(data)[2:16]
#no-yes function counts chemicals which meet condition Cn when condition Cn-1 is not met
# Count the rows of `data` in which every column named in `cols` equals 1.
#
# data: a data frame of 0/1 indicator columns.
# cols: character vector naming one or more columns of `data`.
# Returns the number of matching rows (NA if missing values leave a row
# undecided).
count_no_yes <- function(data, cols) {
  # drop = FALSE keeps a one-column selection as a data frame. Without it the
  # selection collapses to a plain vector and apply() stops with
  # "dim(X) must have a positive length".
  data <- data[, cols, drop = FALSE]
  sum(apply(data, 1, function(x) all(x == 1)))
}
# Endpoint 0 is the starting population; endpoints 1..n add one column of
# orderA at a time. The number of steps is taken from orderA so the axis and
# the loop cannot get out of sync with the column list.
n_steps <- length(orderA)
endpoints <- 0:n_steps
#scenario A with order A of the columns
counts <- vapply(
  seq_len(n_steps),
  function(i) count_no_yes(data, orderA[seq_len(i)]),
  integer(1)
)
# prepend the total number of rows as the hit count for endpoint 0
counts <- c(nrow(data), counts)
scenarioA <- data.frame(
  endpoint = endpoints,
  hits = counts,
  scenario = "scenarioA"
)
My problem is that I don't know how to include the information from the previous observation in my code. The current code is not working. I get the following error: Error in apply(data, 1, function(x) all(x == 1)):dim(X) must have a positive length.
The idea is then to plot the number of observations that meet the conditions for each branch of the tree (column).
#scenario B with a different order of the columns
# orderB selects the same 15 columns (positions 2..16) in a different sequence
orderB <- colnames(data)[c(9, 10, 11, 5, 6, 8, 3, 2, 4, 13, 12, 7, 14, 15, 16)]
# NOTE(review): count_yes_yes() is not defined in the code shown here (only
# count_no_yes() is) -- confirm it exists or whether count_no_yes() was meant.
counts <- sapply(1:15, function(i) count_yes_yes(data, orderB[1:i]))
# prepend the total number of rows as the hit count for endpoint 0
counts <- c(nrow(data), counts)
scenarioB <- data.frame(endpoint=endpoints, hits=counts, scenario="scenarioB")
#combine the different scenarios and plot
scenarios <- rbind(scenarioA, scenarioB)
library(ggplot2)
# one point-and-line series per scenario: hits plotted against endpoint
ggplot(scenarios, aes(x=endpoint, y=hits, color=scenario, group=scenario)) +
geom_point() +
geom_line()
Could it be this?
We tidy the data with tidyr::gather, then dplyr::group_by(par), and count the number of times a 0 is followed by a 1.
# Count the runs of 1s in `x` that are enclosed by 0s on both sides.
#
# x: an atomic vector (typically 0/1 indicators).
# Returns a single number: how many runs of consecutive 1s have a run of 0s
# immediately before and immediately after them. Runs that touch either end
# of `x`, or that are bordered by NA, are not counted.
my.fun <- function(x) {
  # Values of the consecutive runs; rle() is evaluated once and the run
  # lengths are not needed for the count
  v <- rle(x)$values
  # Value of the neighbouring run on each side (NA where there is none)
  before <- c(NA, v[-length(v)])
  after <- c(v[-1], NA)
  # 1 for a run of 1s which came after a 0 and is followed by a 0,
  # 0 when that pattern is ruled out, NA when it cannot be decided
  flag <- as.numeric(v == 1 & before == 0 & after == 0)
  # Return the sum of the `flag` values
  sum(flag, na.rm = TRUE)
}
# Stack every column except Substance into (par, value) pairs, then apply
# my.fun() to the values that came from each original column.
df %>%
  tidyr::gather(key = "par", value = "value", everything(), -Substance) %>%
  group_by(par) %>%
  summarise(c = my.fun(value))
# A tibble: 15 x 2
par c
<chr> <dbl>
1 AT 0
2 C 0
3 Dermal 0
4 Eco.Acute 1
5 Eco.Chronic 0
6 Inhalation 0
7 M 0
8 Oral 0
9 R 4
10 RS 1
11 SC 2
12 SED 1
13 SS 0
14 STOT.RE 4
15 STOT.SE 3
The rle function is a real gem for analyzing consecutiveness in a vector.
The my.fun function can probably be adjusted to your exact needs.
data example
sommer=structure(list(tub = c(1L, 2L, 0L, 2L, 0L, 0L, 0L, 0L, 0L,
1L, 1L, 2L, 1L, 0L, 0L, 1L, 0L, 2L, 1L, 1L, 0L, 0L, 1L, 0L, 0L,
2L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 3L, 0L, 1L, 1L, 1L, 1L,
0L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 3L,
2L, 0L, 1L, 0L, 3L, 2L, 2L, 0L, 0L, 0L, 1L, 0L, 0L, 3L, 1L, 1L,
3L, 1L), fq = c(1L, 1L, 0L, 1L, 0L, 0L, 0L, 0L,
0L, 1L, 3L, 6L, 1L, 0L, 0L, 1L, 0L, 2L, 4L, 2L, 0L, 0L, 2L, 0L,
0L, 5L, 0L, 1L, 2L, 0L, 0L, 1L, 0L, 0L, 0L, 14L, 0L, 1L, 1L,
1L, 4L, 0L, 1L, 0L, 4L, 0L, 0L, 0L, 6L, 1L, 0L, 0L, 1L, 1L, 0L,
1L, 3L, 4L, 0L, 1L, 0L, 8L, 1L, 2L, 0L, 0L, 0L, 2L, 0L, 0L, 3L,
2L, 2L, 1L, 1L)), .Names = c("tub", "fq"), class = "data.frame", row.names = c(NA,
-75L))
I want to calculate SomersDelta:
library("DescTools")
# NOTE(review): `sommer` is a data.frame (columns tub and fq); the
# "cannot coerce to a table" error reported below is raised for this input.
SomersDelta(sommer, direction = c("row", "column"))
and I get the error
Error in as.table.default(x) : cannot coerce to a table
I also want to get a spine plot
# NOTE(review): spineplot() is presumably the base graphics function, in which
# case coin is not what provides it -- confirm.
library("coin")
# `sommer` is passed as a data.frame rather than a table/array; the error
# reported below comes from margin.table().
spineplot(sommer)
but I get the error
Error in margin.table(tab, 1) : 'x' is not an array
Are these two errors related?
How can I calculate SomersDelta and get a spine plot like this?
spine plot
It requires a matrix as input. According to ?SomersDelta
x - a numeric vector or a table. A matrix will be treated as table.
Here, the dataset is of class data.frame. We can convert it to a matrix (as.matrix) and it should work fine
# NOTE(review): as.matrix(sommer) is the 75 x 2 matrix of raw observations, and
# per the documentation quoted above a matrix is treated as a table. Confirm
# that this is the intended statistic rather than the association between the
# two columns, e.g. SomersDelta(sommer$tub, sommer$fq) -- TODO verify.
SomersDelta(as.matrix(sommer), direction = c("row", "column"))
#[1] -0.06137931
In my dataset, there are two grouping variables, shop and art.
Here is a data example:
# The file name must be a quoted string (a bare reg.csv is looked up as an R
# object), and the result is stored as `reg`, the name the model call uses.
reg <- read.csv("reg.csv")
structure(list(shop = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L), .Label = c("a", "c"), class = "factor"), art = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("b", "d"), class = "factor"),
Y = c(177L, 122L, 175L, 140L, 201L, 202L, 279L, 253L, 236L,
137L, 166L, 241L, 195L, 221L, 238L, 203L, 254L, 219L, 101L,
157L, 188L, 219L, 267L, 126L, 291L, 239L, 230L), x1 = c(1L,
0L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L,
0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L), x2 = c(0L, 1L,
1L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 0L,
1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 1L), x3 = c(0L, 0L, 0L,
1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L,
1L, 0L, 1L, 1L, 0L, 1L, 0L, 1L, 0L), x4 = c(0L, 0L, 1L, 1L,
0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L,
0L, 0L, 0L, 1L, 1L, 0L, 1L, 1L), x5 = c(0L, 0L, 1L, 1L, 0L,
0L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L,
1L, 0L, 0L, 1L, 1L, 1L, 0L), x6 = c(0L, 1L, 0L, 0L, 1L, 1L,
0L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 0L,
1L, 1L, 1L, 1L, 0L, 1L), x7 = c(1L, 1L, 0L, 0L, 1L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L,
0L, 1L, 1L, 1L, 0L), x8 = c(0L, 0L, 0L, 1L, 1L, 0L, 0L, 1L,
1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L, 0L, 1L,
0L, 1L, 0L, 1L), x9 = c(1L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 1L,
0L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L,
1L, 1L, 0L)), .Names = c("shop", "art", "Y", "x1", "x2",
"x3", "x4", "x5", "x6", "x7", "x8", "x9"), class = "data.frame", row.names = c(NA,
-27L))
I need to perform a regression analysis for each group separately.
The formula is simple
# The response column is named `Y` (upper case) in the data; R is
# case-sensitive, so `y` would not refer to it.
mymodel <- lm(Y ~ ., data = reg)
I.e. I must perform the analysis for the a+b group and the c+d group separately.
In this example we have only 2 groups (a+b and c+d),
where a and c are shop names, and b and d are vendor codes.
How can I perform the regression analysis separately by group? In the real data there are several tens of groups, so manually dividing the dataset is impossible.
This is a relatively common analytical pattern called split - apply - combine and it is fairly easy to perform with R:
library(tidyverse)
library(broom)
create a function for lm:
# Fit a linear model of Y on all remaining columns of one group's data.
#
# df: a data frame holding the response Y and the predictor columns.
# Returns the fitted lm object.
my_lm <- function(df) {
  lm(data = df, formula = Y ~ .)
}
run the models on nested groups of data:
# One row per (art, shop) group with that group's rows nested in `data`;
# fit a model per group, tidy each fit into a coefficient table, and expand
# those tables back into ordinary rows.
df %>%
  group_by(art, shop) %>%
  nest() %>%
  mutate(fit = map(data, my_lm),
         tidy = map(fit, tidy)) %>%
  select(-fit, -data) %>%
  # Name the list-column explicitly: calling unnest() with no columns is
  # deprecated in current tidyr and emits a warning.
  unnest(tidy)
First you group the data by the desired variables, fit the lm model to each group, use tidy to extract the coefficients, remove the unwanted columns and then unnest. The result is:
#output
art shop term estimate std.error statistic p.value
<fctr> <fctr> <chr> <dbl> <dbl> <dbl> <dbl>
1 b a (Intercept) 31.0 269 0.115 0.927
2 b a x1 109 153 0.714 0.605
3 b a x2 - 23.0 223 -0.103 0.934
4 b a x3 - 15.0 185 -0.0810 0.949
5 b a x4 31.0 333 0.0931 0.941
6 b a x5 81.0 457 0.177 0.888
7 b a x6 77.0 162 0.475 0.718
8 b a x7 - 17.0 310 -0.0548 0.965
9 b a x8 - 15.0 214 -0.0700 0.956
10 b a x9 54.0 349 0.155 0.902
11 d c (Intercept) 199 98.8 2.01 0.0907
12 d c x1 - 15.7 60.8 -0.259 0.804
13 d c x2 5.98 48.8 0.123 0.906
14 d c x3 7.34 57.8 0.127 0.903
15 d c x4 - 20.1 53.8 -0.373 0.722
16 d c x5 - 43.2 41.8 -1.03 0.342
17 d c x6 1.93 34.5 0.0560 0.957
18 d c x7 31.9 40.5 0.787 0.461
19 d c x8 36.0 45.9 0.786 0.462
20 d c x9 10.7 49.7 0.215 0.837
There are many tutorials using the same or similar approach like the one I posted in my comment.
This question already has answers here:
Split a large dataframe into a list of data frames based on common value in column
(3 answers)
Closed 5 years ago.
I have a dataframe:
dput(test)
test <- structure(list(Blocking = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 5L,
5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L), Treatment = structure(c(1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L), .Label = c("A", "B", "C", "D"), class = "factor"),
ID69 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), ID70 = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), ID71 = c(0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L,
1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L), ID72 = c(0L,
0L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L), ID73 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
ID74 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), ID75 = c(0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), ID77 = c(0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
1L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L)), .Names = c("Blocking",
"Treatment", "ID69", "ID70", "ID71", "ID72", "ID73", "ID74",
"ID75", "ID77"), class = "data.frame", row.names = c(NA, -36L
))
I would like to split it based on treatment into four data frames. I tried the following code:
treatments <- c("A", "B", "C", "D")
# Build one data frame per treatment level.
# The filter has to compare the Treatment column (R is case-sensitive) with
# the current element `x` using `==`. A named argument such as
# `treatment = treatments` is not a filter, so subset() would keep every row.
# The data are reached through the `input_df` parameter.
subset_list <- lapply(treatments, function(x, input_df = test) {
  subset(input_df, Treatment == x)
})
names(subset_list) <- treatments
I get a list but individual data frames are the full test df's and do not contain rows only corresponding to the treatment. This seems like a really easy question, but I am missing a crucial detail here and I am new to the apply family. Please let me know, thanks!
You can use split():
# Split on the Treatment column itself. A separate length-4 vector such as
# `treatments` would be recycled down the rows and only gives the right groups
# when the rows happen to repeat A, B, C, D in exactly that order.
split(test, test$Treatment)
res <- split(x, f) divides the data into the groups defined by f into a list.
unsplit(res, f) does the opposite.
I have a data.frame X with a single column and a data.frame C with M binary (0/1) columns. Both data.frames have N rows (examples).
I would like to average X separately for the 0 and the 1 cases of each of the M columns of C.
When I plot this, I expect to get M*2 bars, where the x axis shows the column names of C and red/blue indicates whether category m (out of M) is 0 or 1.
Can this be done using ggplot2?
Any other quick way to do that without for loops?
Result sketch:
*
* * *
* * * *
m1=0, m1=1, m2=0, m2=1 ,....
Thanks,
Hanan
data sample below:
aggregate(X, by = as.list(C), FUN=mean) will aggregate to any combination of C. This is not what I want. I want X aggregated for every value of each column of C INDEPENDENTLY .
X<-structure(list(V1 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)),
.Names = "V1", class = "data.frame", row.names = c(NA, -100L))
C<-structure(list(V1 = c(1L, 0L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 1L,
0L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L,
0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 0L,
1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L,
1L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 1L,
1L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 0L,
1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L), V2 = c(1L, 0L, 1L, 0L,
1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L,
1L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L,
1L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 1L,
0L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 1L,
0L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L
), V3 = c(1L, 0L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 1L,
1L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 1L,
0L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 1L,
1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L,
1L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L,
0L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L,
1L, 1L, 0L, 0L, 1L, 1L, 1L)),
.Names = c("V1", "V2", "V3"), class = "data.frame", row.names = c(NA, -100L))
Here is a way to transform your data broken down by incremental steps
# Build one long data frame holding, for every column of C and each of its
# values, the number of rows where X equals 1. The steps run inside local()
# so that only `dd` is created in the workspace.
dd <- local({
  # Cross-tabulate each column of C against X
  crosstabs <- lapply(C, table, X[[1]], dnn = c("CV", "X"))
  # Tables to long data frames with columns CV, X and Freq
  count_frames <- lapply(crosstabs, as.data.frame)
  # Keep only the counts for X == 1
  x1_counts <- lapply(count_frames, subset, X == 1)
  # Tag each piece with the name of the C column it came from
  labelled <- Map(function(a, b) cbind(C = a, b), names(C), x1_counts)
  # Stack the pieces into a single data frame
  do.call(rbind, labelled)
})
So here we use table() to get the counts of each X value for each C value. Then we turn that into a data.frame and take only the counts for X=1. Finally we add the correct name of the C column and merge all the data.frames into one large data.frame.
Then we can plot that with
# Columns of C along the x-axis, one dodged bar per value of that column (CV);
# bar height is the pre-computed count Freq, hence stat = "identity".
ggplot(dd, aes(x = C, y = Freq, fill = CV)) +
  geom_bar(stat = "identity", position = "dodge")
So the columns of C are listed along the x-axis and the values of C are represented by the color of the bar. The counts of X=1 in each of the groups are the heights of the bars.