Regression analysis with separating groups in R - r

In my dataset, there are two grouping variables, shop and art.
Here is a data example:
# Read the data. The file name must be a quoted string, and the result is
# stored as `reg`, the name passed to `data =` in the model call.
reg <- read.csv("reg.csv")
structure(list(shop = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L), .Label = c("a", "c"), class = "factor"), art = structure(c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 2L, 2L,
2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L, 2L), .Label = c("b", "d"), class = "factor"),
Y = c(177L, 122L, 175L, 140L, 201L, 202L, 279L, 253L, 236L,
137L, 166L, 241L, 195L, 221L, 238L, 203L, 254L, 219L, 101L,
157L, 188L, 219L, 267L, 126L, 291L, 239L, 230L), x1 = c(1L,
0L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 1L,
0L, 0L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L), x2 = c(0L, 1L,
1L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 0L,
1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 1L), x3 = c(0L, 0L, 0L,
1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 0L,
1L, 0L, 1L, 1L, 0L, 1L, 0L, 1L, 0L), x4 = c(0L, 0L, 1L, 1L,
0L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L,
0L, 0L, 0L, 1L, 1L, 0L, 1L, 1L), x5 = c(0L, 0L, 1L, 1L, 0L,
0L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 1L, 1L,
1L, 0L, 0L, 1L, 1L, 1L, 0L), x6 = c(0L, 1L, 0L, 0L, 1L, 1L,
0L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 0L,
1L, 1L, 1L, 1L, 0L, 1L), x7 = c(1L, 1L, 0L, 0L, 1L, 0L, 0L,
0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L,
0L, 1L, 1L, 1L, 0L), x8 = c(0L, 0L, 0L, 1L, 1L, 0L, 0L, 1L,
1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L, 0L, 1L,
0L, 1L, 0L, 1L), x9 = c(1L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 1L,
0L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L,
1L, 1L, 0L)), .Names = c("shop", "art", "Y", "x1", "x2",
"x3", "x4", "x5", "x6", "x7", "x8", "x9"), class = "data.frame", row.names = c(NA,
-27L))
I need to perform regression analysis for each group separately.
The formula is simple
# The response column in the data is named `Y` (R names are case-sensitive).
mymodel <- lm(Y ~ ., data = reg)
I.e. I must perform the analysis for the a+b group and the c+d group separately.
In this example we have only 2 groups (a+b and c+d),
where a and c are shop names, and b and d are vendor codes.
How can I perform the regression analysis separately by group? In the real data there are dozens of groups, so manually splitting the data into separate datasets is impractical.

This is a relatively common analytical pattern called split-apply-combine, and it is fairly easy to perform in R:
library(tidyverse)
library(broom)
create a function for lm:
# Fit a linear model of `Y` on all remaining columns of `df`.
#
# df: data frame containing the response `Y` and the predictor columns.
# Returns the fitted `lm` object.
my_lm <- function(df) {
  fitted_model <- lm(Y ~ ., data = df)
  fitted_model
}
run the models on nested groups of data:
# Fit one model per (art, shop) group and stack the coefficient tables.
df %>%
  group_by(art, shop) %>%
  nest() %>%
  mutate(fit = map(data, my_lm),
         tidy = map(fit, broom::tidy)) %>%
  select(-fit, -data) %>%
  # Name the list-column explicitly: a bare unnest() warns in tidyr >= 1.0
  # that `cols` is required.
  unnest(tidy) %>%
  # Drop any grouping that nest() carried over so later verbs act on all rows.
  ungroup()
First you group by the desired variables, then fit the lm model to each group, use tidy to extract the coefficients, remove the unwanted columns and finally unnest. The result is:
#output
art shop term estimate std.error statistic p.value
<fctr> <fctr> <chr> <dbl> <dbl> <dbl> <dbl>
1 b a (Intercept) 31.0 269 0.115 0.927
2 b a x1 109 153 0.714 0.605
3 b a x2 - 23.0 223 -0.103 0.934
4 b a x3 - 15.0 185 -0.0810 0.949
5 b a x4 31.0 333 0.0931 0.941
6 b a x5 81.0 457 0.177 0.888
7 b a x6 77.0 162 0.475 0.718
8 b a x7 - 17.0 310 -0.0548 0.965
9 b a x8 - 15.0 214 -0.0700 0.956
10 b a x9 54.0 349 0.155 0.902
11 d c (Intercept) 199 98.8 2.01 0.0907
12 d c x1 - 15.7 60.8 -0.259 0.804
13 d c x2 5.98 48.8 0.123 0.906
14 d c x3 7.34 57.8 0.127 0.903
15 d c x4 - 20.1 53.8 -0.373 0.722
16 d c x5 - 43.2 41.8 -1.03 0.342
17 d c x6 1.93 34.5 0.0560 0.957
18 d c x7 31.9 40.5 0.787 0.461
19 d c x8 36.0 45.9 0.786 0.462
20 d c x9 10.7 49.7 0.215 0.837
There are many tutorials using the same or similar approach like the one I posted in my comment.

Related

How to write a function to count the number of observations based on specific conditions in R?

I have a data frame of 1401 observations of 16 variables. For each column (except the first one), I have either 1 (if a condition is met) or 0 (if a condition is not met). Overall, the idea is to count how many observations meet certain conditions successively. We can think about it as a decision tree: in the first branch you can have either 1 (condition is met) or 0 (condition is not met), in the second branch starting from the 0 of the first branch, you can also have 1 or 0, etc... In my data frame, branches are columns. I want to investigate the impact of looking at the different branches (columns) in various orders.
My idea is to count the number of "1" in column Cn if I know that there was a "0" in column Cn-1.
dput(droplevels(head(data,20)))
structure(list(Substance = structure(c(13L, 9L, 10L, 12L, 1L,
19L, 16L, 17L, 5L, 2L, 14L, 7L, 4L, 6L, 20L, 18L, 15L, 3L, 11L,
8L), .Label = c("104653-34-1", "107-02-8", "111-30-8", "12057-74-8",
"122454-29-9", "14915-37-8", "20859-73-8", "27083-27-8", "28772-56-7",
"3691-35-8", "55965-84-9", "56073-07-5", "56073-10-0", "5836-29-3",
"71751-41-2", "74-90-8", "81-81-2", "86347-14-0", "90035-08-8",
"91465-08-6"), class = "factor"), colA = c(1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
colB = c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L), colC = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L), colD = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L), colE = c(0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L,
1L, 1L), colF = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L), colG = c(0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L,
1L), colH = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), colI = c(0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L
), colK = c(1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 0L,
0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L), colJ = c(0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 1L, 0L, 1L,
0L, 0L), colL = c(1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L,
0L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 0L, 1L), colM = c(NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_, NA_integer_,
NA_integer_, NA_integer_, NA_integer_, NA_integer_), colN = c(1L,
1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L), colO = c(1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L)), .Names = c("Substance",
"Oral", "Dermal", "Inhalation", "SC", "SED", "RS", "SS", "M",
"C", "R", "STOT.SE", "STOT.RE", "AT", "Eco.Acute", "Eco.Chronic"
), row.names = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 9L, 10L, 12L, 13L,
14L, 17L, 18L, 19L, 20L, 21L, 22L, 28L, 34L), class = "data.frame")
#I define the order in which I look at the columns
orderA <- colnames(data)[2:16]
# Count the rows of `data` in which every column listed in `cols` equals 1.
# NOTE(review): the "condition Cn-1 is not met" part of the intended no-yes
# logic is not implemented here; this only counts all-ones rows.
#
# data: data frame of 0/1 indicator columns.
# cols: character vector of column names to check.
# Returns the number of matching rows; NA values in the selected columns can
# make the result NA.
count_no_yes <- function(data, cols) {
  # drop = FALSE keeps a one-column selection as a data frame; without it the
  # selection collapses to a vector and apply() stops with
  # "dim(X) must have a positive length".
  selected <- data[, cols, drop = FALSE]
  sum(apply(selected, 1, function(x) all(x == 1)))
}
# Endpoint 0 is the full data set; endpoint i uses the first i columns of the
# chosen order. Lengths are derived from `orderA` instead of hard-coding 15.
endpoints <- 0:length(orderA)
# scenario A with order A of the columns
# vapply() pins the result to one integer per endpoint, unlike sapply().
counts <- vapply(
  seq_along(orderA),
  function(i) count_no_yes(data, orderA[seq_len(i)]),
  integer(1)
)
counts <- c(nrow(data), counts)
scenarioA <- data.frame(endpoint = endpoints, hits = counts, scenario = "scenarioA")
My problem is that I don't know how to include the information from the previous observation in my code. The current code is not working. I get the following error: Error in apply(data, 1, function(x) all(x == 1)): dim(X) must have a positive length.
The idea is then to plot the number of observations that meet the conditions for each branch of the tree (column).
#scenario B with a different order of the columns
orderB <- colnames(data)[c(9, 10, 11, 5, 6, 8, 3, 2, 4, 13, 12, 7, 14, 15, 16)]
counts <- sapply(1:15, function(i) count_yes_yes(data, orderB[1:i]))
counts <- c(nrow(data), counts)
scenarioB <- data.frame(endpoint=endpoints, hits=counts, scenario="scenarioB")
#combine the different scenarios and plot
scenarios <- rbind(scenarioA, scenarioB)
library(ggplot2)
ggplot(scenarios, aes(x=endpoint, y=hits, color=scenario, group=scenario)) +
geom_point() +
geom_line()
Could it be this?
We tidy the data with tidyr::gather, then dplyr::group_by(par), and count, per column, the runs of 1 that come after a 0 and are followed by a 0.
# Count the isolated runs of 1 in `x`: runs of 1 that are immediately preceded
# by a run of 0 and immediately followed by a run of 0.
#
# x: atomic vector of 0/1 values (NA allowed; rle() treats each NA as its own
#    run).
# Returns a single number. A run at the very start or end of `x` has no
# neighbour on one side and is therefore not counted.
my.fun <- function(x) {
  # Run values only; rle() is computed once and the run lengths are not needed.
  v <- rle(x)$values
  # Neighbouring run values, padded with NA at the ends. This is a base-R
  # lag/lead, so the function does not depend on dplyr being attached.
  prev_v <- c(NA, head(v, -1))
  next_v <- c(tail(v, -1), NA)
  # for each run: flag a run of 1s that came after a run of 0s
  # and is itself followed by a run of 0s
  flag <- ifelse(v == 1 & prev_v == 0 & next_v == 0, 1, 0)
  # return the sum of the `flag` values (NA flags at the edges are ignored)
  sum(flag, na.rm = TRUE)
}
df %>%
tidyr::gather("par", "value", everything(), -Substance) %>%
group_by(par) %>%
summarise(c = my.fun(value))
# A tibble: 15 x 2
par c
<chr> <dbl>
1 AT 0
2 C 0
3 Dermal 0
4 Eco.Acute 1
5 Eco.Chronic 0
6 Inhalation 0
7 M 0
8 Oral 0
9 R 4
10 RS 1
11 SC 2
12 SED 1
13 SS 0
14 STOT.RE 4
15 STOT.SE 3
the rle function is a real gem for analyzing consecutiveness in a vector.
The my.fun can probably be adjusted to your exact needs.

How to compute on columns pairwise, from two groups with dplyr

I have a dataset of this shape.
group a1 a2 ... a9 b1 b2 ... b7
1 1 0 ... 1 0 1 ... 1
1 1 1 ... 1 0 0 ... 1
1 0 0 ... 0 1 0 ... 1
1 1 1 ... 0 1 1 ... 0
2 1 0 ... 1 0 1 ... 1
2 1 1 ... 1 0 0 ... 1
2 0 0 ... 0 1 0 ... 1
2 1 1 ... 0 1 1 ... 0
...
and what I'd like to do is apply a two-argument summary function to all pairs of columns, maintaining the grouped nature of the data.
So, for example
# Two-argument summary: mean of `a`, plus mean of `b`, plus the share of
# positions where both `a` and `b` are non-zero.
f <- function(a, b) {
  both <- a & b
  mean(a) + mean(b) + mean(both)
}
would return something like (I'm not actually going to compute the value of the function, I'll just put "x" to indicate where the stat would go, but of course it would be different for each group-a-b combination).
group a_col b_col stat
1 a1 b1 x
1 a1 b2 x
1 a1 b3 x
...
1 a9 b7 x
2 a1 b1 x
...
A commenter asked for some sample data. Here it is:
structure(list(group = c(1L, 1L, 1L, 1L, 2L, 2L, 2L, 2L, 3L,
3L, 3L, 4L, 4L, 4L, 4L, 5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L,
7L, 7L, 7L, 7L, 7L, 7L, 8L, 8L, 8L, 9L, 10L, 10L), a1 = c(0L,
1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L,
1L, 1L, 1L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 0L,
1L, 0L, 0L, 0L), a2 = c(0L, 0L, 0L, 1L, 0L, 0L, 1L, 1L, 1L, 1L,
0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L,
0L, 0L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 1L), a3 = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L,
1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 1L, 1L,
0L, 0L), a4 = c(0L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 0L, 1L,
1L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 0L, 0L, 1L, 1L, 0L, 0L, 0L, 0L), a5 = c(1L, 0L, 0L, 0L, 0L,
0L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L,
0L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 1L, 1L, 0L, 0L, 0L, 1L, 0L, 0L
), b1 = c(1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 0L,
0L, 0L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 0L,
0L, 1L, 0L, 1L, 1L, 0L, 0L, 0L), b2 = c(0L, 0L, 1L, 0L, 0L, 0L,
1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 1L,
1L, 0L, 0L, 0L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 0L),
b3 = c(0L, 1L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 0L, 0L,
1L, 1L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 1L,
1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)), class = "data.frame", row.names = c(NA,
-37L))
A solution using tidyverse. We can gather the columns based on the starting letters twice and then conduct the operation. Assuming your data is called dat, dat2 is the final output.
library(tidyverse)
# Reshape so that every (a-column, b-column) pair becomes its own set of rows,
# then compute the statistic per group and pair.
dat2 <- dat %>%
  gather(column_a, value_a, starts_with("a")) %>%
  gather(column_b, value_b, starts_with("b")) %>%
  group_by(group, column_a, column_b) %>%
  # mean(value_a & value_b) matches the requested f(a, b); using `+` here
  # would only add the two means a second time instead of measuring
  # co-occurrence.
  summarise(stat = mean(value_a) + mean(value_b) + mean(value_a & value_b)) %>%
  ungroup()
dat2
# # A tibble: 150 x 4
# group column_a column_b stat
# <int> <chr> <chr> <dbl>
# 1 1 a1 b1 2
# 2 1 a1 b2 1.25
# 3 1 a1 b3 1.25
# 4 1 a2 b1 1
# 5 1 a2 b2 0.5
# 6 1 a2 b3 0.5
# 7 1 a3 b1 2.5
# 8 1 a3 b2 1.5
# 9 1 a3 b3 1.5
# 10 1 a4 b1 1.25
# # ... with 140 more rows

Express relations between three variables using ggplot2 in R

I have a data frame like this
structure(list(cli_exp = c(1L, 1L, 2L, 1L, 1L, 0L, 2L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 2L, 2L, 0L, 1L, 0L,
1L, 1L, 2L, 0L, 1L), vcs_exp = c(0L, 0L, 1L, 0L, 0L, 0L, 0L,
1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 2L, 1L,
1L, 0L, 0L, 0L, 2L, 1L, 0L), web_exp = c(2L, 2L, 2L, 1L, 0L,
0L, 1L, 2L, 0L, 0L, 3L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 2L, 1L, 1L,
1L, 1L, 0L, 0L, 1L, 1L, 2L, 0L, 0L)), .Names = c("cli_exp", "vcs_exp",
"web_exp"), row.names = c(NA, 30L), class = "data.frame")
I want to use ggplot2 to express the relation between these three variables and tried the simple point plot
ggplot(data = data) +
geom_point(mapping = aes(x = web_exp, y = vcs_exp, color = cli_exp))
But apparently, there are many overlapping data points, which are not suitable for point display. Are there any better ways?
I would use ggpairs from GGally package
tmp_df <- structure(list(cli_exp = c(1L, 1L, 2L, 1L, 1L, 0L, 2L, 0L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 2L, 2L, 0L, 1L, 0L,
1L, 1L, 2L, 0L, 1L), vcs_exp = c(0L, 0L, 1L, 0L, 0L, 0L, 0L,
1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 2L, 1L,
1L, 0L, 0L, 0L, 2L, 1L, 0L), web_exp = c(2L, 2L, 2L, 1L, 0L,
0L, 1L, 2L, 0L, 0L, 3L, 0L, 1L, 0L, 1L, 0L, 1L, 0L, 2L, 1L, 1L,
1L, 1L, 0L, 0L, 1L, 1L, 2L, 0L, 0L)), .Names = c("cli_exp", "vcs_exp",
"web_exp"), row.names = c(NA, 30L), class = "data.frame")
library(GGally)
ggpairs(tmp_df,
upper = list(continuous = wrap("cor", size = 10)),
lower = list(continuous = "smooth"))
Edit: use pairs from base R
pairs(tmp_df)
Use pairs.panels from psych package
library(psych)
pairs.panels(tmp_df,
method = "pearson",
density = TRUE,
ellipses = TRUE
)
As you mentioned, the points overlap, so some points aren't visible when using geom_point.
ggplot(data = df, aes(x = web_exp, y = vcs_exp, color = cli_exp)) +
geom_point()
This can be solved by adding a small amount of jitter. Also, making the points slightly transparent will make any overlaps more clear.
ggplot(data = df, aes(x = web_exp, y = vcs_exp, color = cli_exp)) +
geom_jitter(width = 0.05, height = 0.05, alpha = 0.8)

subset a data frame by group [duplicate]

This question already has answers here:
Split a large dataframe into a list of data frames based on common value in column
(3 answers)
Closed 5 years ago.
I have a dataframe:
dput(test)
test <- structure(list(Blocking = c(1L, 1L, 1L, 1L, 1L, 1L, 2L, 2L, 2L,
2L, 2L, 2L, 3L, 3L, 3L, 3L, 3L, 3L, 4L, 4L, 4L, 4L, 4L, 4L, 5L,
5L, 5L, 5L, 5L, 5L, 6L, 6L, 6L, 6L, 6L, 6L), Treatment = structure(c(1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L, 2L, 3L, 4L, 1L,
2L, 3L, 4L), .Label = c("A", "B", "C", "D"), class = "factor"),
ID69 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), ID70 = c(1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L), ID71 = c(0L, 0L, 1L, 1L, 1L, 0L, 0L, 0L, 0L,
1L, 1L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L), ID72 = c(0L,
0L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 1L, 0L), ID73 = c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L),
ID74 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L), ID75 = c(0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L), ID77 = c(0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
1L, 1L, 1L, 1L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 0L)), .Names = c("Blocking",
"Treatment", "ID69", "ID70", "ID71", "ID72", "ID73", "ID74",
"ID75", "ID77"), class = "data.frame", row.names = c(NA, -36L
))
I would like to split it based on treatment into four data frames. I tried the following code:
# Attempt to build one data frame per treatment level and name the list
# elements after the levels. As described below, this does not produce
# per-treatment subsets.
treatments <- c("A", "B", "C", "D")
subset_list <- lapply(treatments, function(x, input_df = test){
# NOTE(review): the parameter is `input_df`, but the call below uses `input_dt`.
# NOTE(review): `treatment=treatments` is a named argument written with `=`,
# not a row filter with `==` on the `Treatment` column, and `x` (the current
# level) is never used.
subset(input_dt, treatment=treatments)
})
names(subset_list) <- treatments
I get a list but individual data frames are the full test df's and do not contain rows only corresponding to the treatment. This seems like a really easy question, but I am missing a crucial detail here and I am new to the apply family. Please let me know, thanks!
You can use split():
# Split on the Treatment column itself. The 4-element `treatments` vector would
# be recycled along the rows and only gives the right groups when the rows
# happen to cycle A, B, C, D in order.
split(test, test$Treatment)
res <- split(x, f) divides the data into the groups defined by f into a list.
unsplit(res, f) does the opposite.

plotting mean of variable versus matrix of conditions in R using ggplot2

I have a data.frame X with column X and a data.frame C with M binary values (0/1). Both data.frames have N rows (examples).
I would like to average X on each case 0/1 of each m out of M column of C.
When I plot this, I expect to get M*2 bars where the x axis shows the column names of C and red/blue indicates whether category m (out of M) is 0 or 1.
Can this be done using ggplot2?
Any other quick way to do that without for loops?
Result sketch:
*
* * *
* * * *
m1=0, m1=1, m2=0, m2=1 ,....
Thanks,
Hanan
data sample below:
aggregate(X, by = as.list(C), FUN=mean) will aggregate to any combination of C. This is not what I want. I want X aggregated for every value of each column of C INDEPENDENTLY .
X<-structure(list(V1 = c(0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L,
0L, 0L, 0L, 1L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L, 0L)),
.Names = "V1", class = "data.frame", row.names = c(NA, -100L))
C<-structure(list(V1 = c(1L, 0L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 1L,
0L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L,
0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 0L,
1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L,
1L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 1L,
1L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 0L,
1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L), V2 = c(1L, 0L, 1L, 0L,
1L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L,
1L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L,
1L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L,
1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 1L,
0L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 1L,
0L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L
), V3 = c(1L, 0L, 1L, 0L, 1L, 1L, 0L, 0L, 1L, 0L, 0L, 1L, 1L,
1L, 1L, 0L, 1L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 1L,
0L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 0L, 1L, 1L, 1L,
1L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 0L, 1L, 1L, 1L, 1L,
1L, 0L, 1L, 0L, 0L, 1L, 1L, 0L, 1L, 0L, 0L, 1L, 1L, 1L, 1L, 1L,
0L, 1L, 0L, 1L, 1L, 1L, 1L, 0L, 0L, 1L, 1L, 1L, 0L, 1L, 1L, 1L,
1L, 1L, 0L, 0L, 1L, 1L, 1L)),
.Names = c("V1", "V2", "V3"), class = "data.frame", row.names = c(NA, -100L))
Here is a way to transform your data broken down by incremental steps
dd <- do.call(rbind,
Map(function(a,b) cbind(C=a, b), names(C),
lapply(
lapply(
lapply(C, table, X[[1]], dnn=c("CV","X")),
as.data.frame),
subset, X==1)
))
So here we use table() to get the counts of each X value for each C value. Then we turn that into a data.frame and take only the counts for X=1. Finally we add the correct name of the C column and merge all the data.frames into one large data.frame.
Then we can plot that with
ggplot(dd, aes(x=C, y=Freq, fill=CV)) +
geom_bar(position="dodge", stat="identity")
So the columns of C are listed along the x-axis and the values of C are represented by the color of the bar. The counts of X=1 in each of the groups are the heights of the bars.

Resources