In R I have 2 datasets: group1 and group2.
For group1 I have 10 values of game_id, which is the id of a game, and number, which is the number of times that game has been played in group1.
So if we type
group1
we get this output
game_id number
1 758565
2 235289
...
10 87084
For group2 we get
game_id number
1 79310
2 28564
...
10 9048
If I want to test whether there is a statistical difference between group1 and group2 for the first 2 game_ids, I can use a Pearson chi-square test.
In R I simply create the matrix
# The first 2 'numbers' in group1
a <- c( group1[1,2] , group1[2,2] )
# The first 2 'numbers' in group2
b <- c( group2[1,2], group2[2,2] )
# Creating it on matrix-form
m <- rbind(a,b)
So m gives us
a 758565 235289
b 79310 28564
Here I can test H: "a is independent of b", i.e. whether users in group1 split their plays between game_id 1 and game_id 2 in the same proportion as users in group2.
In R we type chisq.test(m) and get a very low p-value, meaning we can reject H: a and b are not independent.
How should one find the game_ids that are played significantly more in group1 than in group2?
I created a simpler version with only 3 games. I'm using a chi-squared test and a test comparing proportions. Personally, I prefer the second one, as it gives you an idea of which percentages you're comparing. Run the script and make sure you understand the process.
# dataset of group 1
dt_group1 = data.frame(game_id = 1:3,
number_games = c(758565,235289,87084))
dt_group1
# game_id number_games
# 1 1 758565
# 2 2 235289
# 3 3 87084
# add extra variables
dt_group1$number_rest_games = sum(dt_group1$number_games) - dt_group1$number_games # needed for chisq.test
dt_group1$number_all_games = sum(dt_group1$number_games) # needed for prop.test
dt_group1$Prc = dt_group1$number_games / dt_group1$number_all_games # just to get an idea about the percentages
dt_group1
# game_id number_games number_rest_games number_all_games Prc
# 1 1 758565 322373 1080938 0.70176550
# 2 2 235289 845649 1080938 0.21767113
# 3 3 87084 993854 1080938 0.08056336
# dataset of group 2
dt_group2 = data.frame(game_id = 1:3,
number_games = c(79310,28564,9048))
# add extra variables
dt_group2$number_rest_games = sum(dt_group2$number_games) - dt_group2$number_games
dt_group2$number_all_games = sum(dt_group2$number_games)
dt_group2$Prc = dt_group2$number_games / dt_group2$number_all_games
# input the game id you want to investigate
input_game_id = 1
# create a table of successes (games played) and failures (games not played)
dt_test = rbind(c(dt_group1$number_games[dt_group1$game_id==input_game_id], dt_group1$number_rest_games[dt_group1$game_id==input_game_id]),
c(dt_group2$number_games[dt_group2$game_id==input_game_id], dt_group2$number_rest_games[dt_group2$game_id==input_game_id]))
# perform chi sq test
chisq.test(dt_test)
# Pearson's Chi-squared test with Yates' continuity correction
#
# data: dt_test
# X-squared = 275.9, df = 1, p-value < 2.2e-16
# create a vector of successes (games played) and vector of total games
x = c(dt_group1$number_games[dt_group1$game_id==input_game_id], dt_group2$number_games[dt_group2$game_id==input_game_id])
y = c(dt_group1$number_all_games[dt_group1$game_id==input_game_id], dt_group2$number_all_games[dt_group2$game_id==input_game_id])
# perform test of proportions
prop.test(x,y)
# 2-sample test for equality of proportions with continuity correction
#
# data: x out of y
# X-squared = 275.9, df = 1, p-value < 2.2e-16
# alternative hypothesis: two.sided
# 95 percent confidence interval:
# 0.02063233 0.02626776
# sample estimates:
# prop 1 prop 2
# 0.7017655 0.6783155
The main thing is that chisq.test compares counts/proportions, so you need to provide the number of "successes" and "failures" for the groups you compare (a contingency table as input). prop.test is another counts/proportions test, but for it you provide the number of "successes" and the "totals".
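To make the difference in input shape concrete, here is a minimal sketch with the game_id 1 counts from above (758565 played / 322373 other games in group 1; 79310 / 37612 in group 2); both calls test the same comparison, and m_example is just an illustrative name:
# same counts as game_id 1 above, shaped two ways
m_example = rbind(c(758565, 322373),   # group 1: game 1 played vs. all other games
                  c(79310,  37612))    # group 2: game 1 played vs. all other games
chisq.test(m_example)                  # input: contingency table of successes and failures
prop.test(x = c(758565, 79310),        # input: successes per group
          n = c(1080938, 116922))      #        totals per group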
Now that you're happy with the result and have seen how the process works, I'll add a more efficient way to perform those tests.
The first one uses the dplyr and broom packages:
library(dplyr)
library(broom)
# dataset of group 1
dt_group1 = data.frame(game_id = 1:3,
number_games = c(758565,235289,87084),
group_id = 1) ## adding the id of the group
# dataset of group 2
dt_group2 = data.frame(game_id = 1:3,
number_games = c(79310,28564,9048),
group_id = 2) ## adding the id of the group
# combine datasets
dt = rbind(dt_group1, dt_group2)
dt %>%
group_by(group_id) %>% # for each group id
mutate(number_all_games = sum(number_games), # create new columns
number_rest_games = number_all_games - number_games,
Prc = number_games / number_all_games) %>%
group_by(game_id) %>% # for each game
do(tidy(prop.test(.$number_games, .$number_all_games))) %>% # perform the test
ungroup()
# game_id estimate1 estimate2 statistic p.value parameter conf.low conf.high
# (int) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl)
# 1 1 0.70176550 0.67831546 275.89973 5.876772e-62 1 0.020632330 0.026267761
# 2 2 0.21767113 0.24429962 435.44091 1.063385e-96 1 -0.029216006 -0.024040964
# 3 3 0.08056336 0.07738492 14.39768 1.479844e-04 1 0.001558471 0.004798407
The other one uses the data.table and broom packages:
library(data.table)
library(broom)
# dataset of group 1
dt_group1 = data.frame(game_id = 1:3,
number_games = c(758565,235289,87084),
group_id = 1) ## adding the id of the group
# dataset of group 2
dt_group2 = data.frame(game_id = 1:3,
number_games = c(79310,28564,9048),
group_id = 2) ## adding the id of the group
# combine datasets
dt = data.table(rbind(dt_group1, dt_group2))
# create new columns for each group
dt[, number_all_games := sum(number_games), by=group_id]
dt[, `:=`(number_rest_games = number_all_games - number_games,
Prc = number_games / number_all_games) , by=group_id]
# for each game id compare percentages
dt[, tidy(prop.test(.SD$number_games, .SD$number_all_games)) , by=game_id]
# game_id estimate1 estimate2 statistic p.value parameter conf.low conf.high
# 1: 1 0.70176550 0.67831546 275.89973 5.876772e-62 1 0.020632330 0.026267761
# 2: 2 0.21767113 0.24429962 435.44091 1.063385e-96 1 -0.029216006 -0.024040964
# 3: 3 0.08056336 0.07738492 14.39768 1.479844e-04 1 0.001558471 0.004798407
You can see that each row represents one game and the comparison is between groups 1 and 2. You can get the p-values from the corresponding column, as well as other information about the test/comparison.
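Coming back to the original question (which game_ids are played significantly more in group1), a minimal sketch of how that output could be filtered, assuming the pipeline above has been assigned to a variable I'm calling res here:
res %>%
  filter(estimate1 > estimate2,   # proportion higher in group 1 than in group 2
         p.value < 0.05) %>%      # and the difference is significant at the 5% level
  select(game_id, estimate1, estimate2, p.value)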
Related
I am having trouble trying to apply a custom function to multiple groups within a data frame and mutate the result back onto the original data. I am trying to calculate the percent inhibition for each row of data (each observation in the experiment has a value). The challenging issue is that the function needs the mean of two different groups of values (positive and negative controls) and then uses that mean value in each calculation.
In other words, the experimental value is subtracted from the mean of the negative control, and the result is divided by the mean of the negative control minus the mean of the positive control.
Each observation, including the + and - controls, should have a calculated percent inhibition; as a double check, within each experiment (grouping) the mean percent inhibition of the - controls should be around 0 and that of the + controls around 100.
The function:
percent_inhibition <- function(uninhibited, inhibited, unknown){
uninhibited <- as.vector(uninhibited)
inhibited <- as.vector(inhibited)
unknown <- as.vector(unknown)
mu_u <- mean(uninhibited, na.rm = TRUE)
mu_i <- mean(inhibited, na.rm = TRUE)
percent_inhibition <- (mu_u - unknown)/(mu_u - mu_i)*100
return(percent_inhibition)
}
I have a data frame with multiple variables: target, box, replicate, and sample type. I am able to do the calculation by subsetting the data (below: one target, box, and replicate), but have not been able to figure out the right way to apply it to all of the data.
subset <- data %>%
filter(target == "A", box == "1", replicate == 1)
uninhib <-
subset$value[subset$sample == "uninhib"]
inhib <-
subset$value[subset$sample == "inhib"]
pct <- subset %>%
mutate(pct = percent_inhibition(uninhib, inhib, .$value))
I have tried group_by and do, and nest functions, but my knowledge is lacking in how to apply these functions to my subsetting problem. I'm stuck when it comes to the subset of the subset (calculating the means) and then applying that to the individual values. I am hoping there is an elegant way to do this without all of the subsetting, but I am at a loss on how.
I have tried:
inhibition <- data %>%
group_by(target, box, replicate) %>%
mutate(pct = (percent_inhibition(.$value[.$sample == "uninhib"], .$value[.$sample == "inhib"], .$value)))
But I get the error that the columns are not the right length, because of the group_by function.
library(tidyr)
library(purrr)
library(dplyr)
data %>%
group_by(target, box, replicate) %>%
mutate(pct = {
x <- split(value, sample)
percent_inhibition(x$uninhib, x$inhib, value)
})
#> # A tibble: 10,000 x 6
#> # Groups: target, box, replicate [27]
#> target box replicate sample value pct
#> <chr> <chr> <int> <chr> <dbl> <dbl>
#> 1 A 1 3 inhib -0.836 1941.
#> 2 C 1 1 uninhib -0.221 -281.
#> 3 B 3 2 inhib -2.10 1547.
#> 4 C 1 1 uninhib -1.67 -3081.
#> 5 C 1 3 inhib -1.10 -1017.
#> 6 A 2 1 inhib -1.67 906.
#> 7 B 3 1 uninhib -0.0495 -57.3
#> 8 C 3 2 inhib 1.56 5469.
#> 9 B 3 2 uninhib -0.405 321.
#> 10 B 1 2 inhib 0.786 -3471.
#> # … with 9,990 more rows
Created on 2019-03-25 by the reprex package (v0.2.1)
Or:
data %>%
group_by(target, box, replicate) %>%
mutate(pct = percent_inhibition(value[sample == "uninhib"],
value[sample == "inhib"], value))
With data as:
n <- 10000L
set.seed(123) ; data <-
tibble(
target = sample(LETTERS[1:3], n, replace = TRUE),
box = sample(as.character(1:3), n, replace = TRUE),
replicate = sample(1:3, n, replace = TRUE),
sample = sample(c("inhib", "uninhib"), n, replace = TRUE),
value = rnorm(n)
)
I want to do a linear regression in R using the lm() function. My data is an annual time series with one field for year (22 years) and another for state (50 states). I want to fit a regression for each state so that at the end I have a vector of lm responses. I can imagine doing a for loop over the states, running the regression inside the loop, and adding the results of each regression to a vector. That does not seem very R-like, however. In SAS I would use a 'by' statement and in SQL I would use 'group by'. What's the R way of doing this?
Since 2009, dplyr has been released, which provides a very nice way to do this kind of grouping, closely resembling what SAS does.
library(dplyr)
d <- data.frame(state=rep(c('NY', 'CA'), c(10, 10)),
year=rep(1:10, 2),
response=c(rnorm(10), rnorm(10)))
fitted_models = d %>% group_by(state) %>% do(model = lm(response ~ year, data = .))
# Source: local data frame [2 x 2]
# Groups: <by row>
#
# state model
# (fctr) (chr)
# 1 CA <S3:lm>
# 2 NY <S3:lm>
fitted_models$model
# [[1]]
#
# Call:
# lm(formula = response ~ year, data = .)
#
# Coefficients:
# (Intercept) year
# -0.06354 0.02677
#
#
# [[2]]
#
# Call:
# lm(formula = response ~ year, data = .)
#
# Coefficients:
# (Intercept) year
# -0.35136 0.09385
To retrieve the coefficients and the R-squared / p-values, one can use the broom package. This package provides:
three S3 generics: tidy, which summarizes a model's
statistical findings such as coefficients of a regression;
augment, which adds columns to the original data such as
predictions, residuals and cluster assignments; and glance, which
provides a one-row summary of model-level statistics.
library(broom)
fitted_models %>% tidy(model)
# Source: local data frame [4 x 6]
# Groups: state [2]
#
# state term estimate std.error statistic p.value
# (fctr) (chr) (dbl) (dbl) (dbl) (dbl)
# 1 CA (Intercept) -0.06354035 0.83863054 -0.0757668 0.9414651
# 2 CA year 0.02677048 0.13515755 0.1980687 0.8479318
# 3 NY (Intercept) -0.35135766 0.60100314 -0.5846187 0.5749166
# 4 NY year 0.09385309 0.09686043 0.9689519 0.3609470
fitted_models %>% glance(model)
# Source: local data frame [2 x 12]
# Groups: state [2]
#
# state r.squared adj.r.squared sigma statistic p.value df
# (fctr) (dbl) (dbl) (dbl) (dbl) (dbl) (int)
# 1 CA 0.004879969 -0.119510035 1.2276294 0.0392312 0.8479318 2
# 2 NY 0.105032068 -0.006838924 0.8797785 0.9388678 0.3609470 2
# Variables not shown: logLik (dbl), AIC (dbl), BIC (dbl), deviance (dbl),
# df.residual (int)
fitted_models %>% augment(model)
# Source: local data frame [20 x 10]
# Groups: state [2]
#
# state response year .fitted .se.fit .resid .hat
# (fctr) (dbl) (int) (dbl) (dbl) (dbl) (dbl)
# 1 CA 0.4547765 1 -0.036769875 0.7215439 0.4915464 0.3454545
# 2 CA 0.1217003 2 -0.009999399 0.6119518 0.1316997 0.2484848
# 3 CA -0.6153836 3 0.016771076 0.5146646 -0.6321546 0.1757576
# 4 CA -0.9978060 4 0.043541551 0.4379605 -1.0413476 0.1272727
# 5 CA 2.1385614 5 0.070312027 0.3940486 2.0682494 0.1030303
# 6 CA -0.3924598 6 0.097082502 0.3940486 -0.4895423 0.1030303
# 7 CA -0.5918738 7 0.123852977 0.4379605 -0.7157268 0.1272727
# 8 CA 0.4671346 8 0.150623453 0.5146646 0.3165112 0.1757576
# 9 CA -1.4958726 9 0.177393928 0.6119518 -1.6732666 0.2484848
# 10 CA 1.7481956 10 0.204164404 0.7215439 1.5440312 0.3454545
# 11 NY -0.6285230 1 -0.257504572 0.5170932 -0.3710185 0.3454545
# 12 NY 1.0566099 2 -0.163651479 0.4385542 1.2202614 0.2484848
# 13 NY -0.5274693 3 -0.069798386 0.3688335 -0.4576709 0.1757576
# 14 NY 0.6097983 4 0.024054706 0.3138637 0.5857436 0.1272727
# 15 NY -1.5511940 5 0.117907799 0.2823942 -1.6691018 0.1030303
# 16 NY 0.7440243 6 0.211760892 0.2823942 0.5322634 0.1030303
# 17 NY 0.1054719 7 0.305613984 0.3138637 -0.2001421 0.1272727
# 18 NY 0.7513057 8 0.399467077 0.3688335 0.3518387 0.1757576
# 19 NY -0.1271655 9 0.493320170 0.4385542 -0.6204857 0.2484848
# 20 NY 1.2154852 10 0.587173262 0.5170932 0.6283119 0.3454545
# Variables not shown: .sigma (dbl), .cooksd (dbl), .std.resid (dbl)
Here's an approach using the plyr package:
d <- data.frame(
state = rep(c('NY', 'CA'), 10),
year = rep(1:10, 2),
response= rnorm(20)
)
library(plyr)
# Break up d by state, then fit the specified model to each piece and
# return a list
models <- dlply(d, "state", function(df)
lm(response ~ year, data = df))
# Apply coef to each model and return a data frame
ldply(models, coef)
# Print the summary of each model
l_ply(models, summary, .print = TRUE)
Here's one way using the lme4 package.
library(lme4)
d <- data.frame(state=rep(c('NY', 'CA'), c(10, 10)),
year=rep(1:10, 2),
response=c(rnorm(10), rnorm(10)))
library(lattice)  # xyplot comes from lattice, not lme4
xyplot(response ~ year, groups=state, data=d, type='l')
fits <- lmList(response ~ year | state, data=d)
fits
#------------
Call: lmList(formula = response ~ year | state, data = d)
Coefficients:
(Intercept) year
CA -1.34420990 0.17139963
NY 0.00196176 -0.01852429
Degrees of freedom: 20 total; 16 residual
Residual standard error: 0.8201316
In my opinion, a mixed linear model is a better approach for this kind of data. The code below gives the overall trend in the fixed effect. The random effects indicate how the trend for each individual state differs from the global trend. The correlation structure takes the temporal autocorrelation into account. Have a look at Pinheiro & Bates (Mixed Effects Models in S and S-Plus).
library(nlme)
lme(response ~ year, random = ~ year | state, correlation = corAR1(form = ~ year), data = d)  # using the example data d from above
A nice solution using data.table was posted on CrossValidated by @Zach.
I'd just add that it is also possible to obtain the coefficient of determination (r^2) per group:
## make fake data
library(data.table)
set.seed(1)
dat <- data.table(x=runif(100), y=runif(100), grp=rep(1:2,50))
##calculate the regression coefficient r^2
dat[,summary(lm(y~x))$r.squared,by=grp]
grp V1
1: 1 0.01465726
2: 2 0.02256595
as well as all the other output from summary(lm):
dat[,list(r2=summary(lm(y~x))$r.squared , f=summary(lm(y~x))$fstatistic[1] ),by=grp]
grp r2 f
1: 1 0.01465726 0.714014
2: 2 0.02256595 1.108173
I think it's worthwhile to add the purrr::map approach to this problem.
library(tidyverse)
d <- data.frame(state=rep(c('NY', 'CA'), c(10, 10)),
year=rep(1:10, 2),
response=c(rnorm(10), rnorm(10)))
d %>%
group_by(state) %>%
nest() %>%
mutate(model = map(data, ~lm(response ~ year, data = .)))
See @Paul Hiemstra's answer for further ideas on using the broom package with these results.
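For example, here is a sketch (assuming broom is installed) of how the nested models could be tidied into one coefficient table:
library(broom)
d %>%
  group_by(state) %>%
  nest() %>%                                            # one row per state with a nested data frame
  mutate(model = map(data, ~lm(response ~ year, data = .)),
         tidied = map(model, tidy)) %>%                 # per-state coefficient tables
  select(state, tidied) %>%
  unnest(tidied)                                        # long table: state, term, estimate, ...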
I know my answer comes a bit late, but I was looking for similar functionality. It would seem the built-in function by() in R can also do the grouping easily:
?by contains the following example, which fits per group and extracts the coefficients with sapply:
require(stats)
## now suppose we want to extract the coefficients by group
tmp <- with(warpbreaks,
by(warpbreaks, tension,
function(x) lm(breaks ~ wool, data = x)))
sapply(tmp, coef)
## make fake data
ngroups <- 2
group <- 1:ngroups
nobs <- 100
dta <- data.frame(group=rep(group,each=nobs),y=rnorm(nobs*ngroups),x=runif(nobs*ngroups))
head(dta)
#--------------------
group y x
1 1 0.6482007 0.5429575
2 1 -0.4637118 0.7052843
3 1 -0.5129840 0.7312955
4 1 -0.6612649 0.9028034
5 1 -0.5197448 0.1661308
6 1 0.4240346 0.8944253
#------------
## function to extract the results of one model
foo <- function(z) {
## coef and se in a data frame
mr <- data.frame(coef(summary(lm(y~x,data=z))))
## put row names (predictors/indep variables)
mr$predictor <- rownames(mr)
mr
}
## see that it works
foo(subset(dta,group==1))
#=========
Estimate Std..Error t.value Pr...t.. predictor
(Intercept) 0.2176477 0.1919140 1.134090 0.2595235 (Intercept)
x -0.3669890 0.3321875 -1.104765 0.2719666 x
#----------
## one option: use command by
res <- by(dta,dta$group,foo)
res
#=========
dta$group: 1
Estimate Std..Error t.value Pr...t.. predictor
(Intercept) 0.2176477 0.1919140 1.134090 0.2595235 (Intercept)
x -0.3669890 0.3321875 -1.104765 0.2719666 x
------------------------------------------------------------
dta$group: 2
Estimate Std..Error t.value Pr...t.. predictor
(Intercept) -0.04039422 0.1682335 -0.2401081 0.8107480 (Intercept)
x 0.06286456 0.3020321 0.2081387 0.8355526 x
## using package plyr is better
library(plyr)
res <- ddply(dta,"group",foo)
res
#----------
group Estimate Std..Error t.value Pr...t.. predictor
1 1 0.21764767 0.1919140 1.1340897 0.2595235 (Intercept)
2 1 -0.36698898 0.3321875 -1.1047647 0.2719666 x
3 2 -0.04039422 0.1682335 -0.2401081 0.8107480 (Intercept)
4 2 0.06286456 0.3020321 0.2081387 0.8355526 x
The lm() function above is a simple example. By the way, I imagine that your data has columns of the following form:
year state var1 var2 y...
From my point of view, you can use the following code:
# data = your data frame; state is the label of the states column
modell <- by(data, data$state, function(d) lm(y ~ I(1/var1) + I(1/var2), data = d))
summary(modell)
The question seems to be about how to call regression functions with formulas which are modified inside a loop.
Here is how you can do it (using the diamonds dataset):
strCols = names(ggplot2::diamonds)
formula <- list(); model <- list()
for (i in 1:1) {
formula[[i]] = paste0(strCols[7], " ~ ", strCols[7+i])
model[[i]] = glm(formula[[i]], data = ggplot2::diamonds)
#then you can plot the results or anything else ...
png(filename = sprintf("diamonds_price=glm(%s).png", strCols[7+i]))
par(mfrow = c(2, 2))
plot(model[[i]])
dev.off()
}
I'm trying to add a column to a dataframe that gives the frequency of unique values in a character column. This is what I have so far:
term estimate std.error statistic p.value
1 (Intercept) 6.0888310 1.3601938 4.4764437 8.318542e-06
2 factor(age76)25 0.6884056 0.8861507 0.7768494 4.374021e-01
3 factor(age76)26 0.2177806 0.9997128 0.2178431 8.275887e-01
4 factor(age76)27 0.5539639 0.9255542 0.5985213 5.496061e-01
5 factor(age76)28 0.8705031 0.5343690 1.6290300 1.035716e-01
6 factor(age76)29 1.2249185 0.7557118 1.6208804 1.053084e-01
7 factor(age76)30 0.6254308 0.8861507 0.7057838 4.804608e-01
8 factor(age76)31 1.2295179 0.5343690 2.3008782 2.157089e-02
9 factor(age76)32 0.3032523 0.8449115 0.3589161 7.197216e-01
10 factor(age76)33 1.1344686 0.7557118 1.5011921 1.335714e-01
sapply(df.b, class)
term estimate std.error statistic p.value
"character" "numeric" "numeric" "numeric" "numeric"
library(dplyr)
df.b$n <- group_by(df.b$term) %>%
summarise(df.b$term, freq = n())
Error in UseMethod("group_by_") :
no applicable method for 'group_by_' applied to an object of class "character"
There seems to be a problem with the character type of my column. If I change it to numeric, I am under the impression that the values will become NA.
dput(head(df.b))
structure(list(term = c("(Intercept)", "factor(age76)25", "factor(age76)26",
"factor(age76)27", "factor(age76)28", "factor(age76)29"), estimate = c(6.08883100125014,
0.688405615000334, 0.21778058000053, 0.553963930000528, 0.870503050000005,
1.22491850000015), std.error = c(1.36019381570938, 0.886150663575717,
0.999712776013908, 0.925554182033106, 0.534368956146369, 0.75571182509336
), statistic = c(4.47644367363531, 0.776849404166263, 0.217843149778352,
0.598521340785982, 1.62902998010529, 1.6208804193964), p.value = c(8.31854214736379e-06,
0.437402143453174, 0.827588701982869, 0.549606122411782, 0.103571567056818,
0.105308432290008)), .Names = c("term", "estimate", "std.error",
"statistic", "p.value"), row.names = c(NA, 6L), class = "data.frame")
I have also tried this, but it gives a warning:
df.b$n <- group_by(df.b, term)%>%
summarise(freq = n())
head(df.b)
term estimate std.error statistic p.value n
1 (Intercept) 6.0888310 1.3601938 4.4764437 8.318542e-06 # A tibble: 6 x 2
2 factor(age76)25 0.6884056 0.8861507 0.7768494 4.374021e-01 term freq
3 factor(age76)26 0.2177806 0.9997128 0.2178431 8.275887e-01 <chr> <int>
4 factor(age76)27 0.5539639 0.9255542 0.5985213 5.496061e-01 1 (Intercept) 1
5 factor(age76)28 0.8705031 0.5343690 1.6290300 1.035716e-01 2 factor(age76)25 1
6 factor(age76)29 1.2249185 0.7557118 1.6208804 1.053084e-01 3 factor(age76)25:factor(black)1 1
Warning message:
In format.data.frame(x, digits = digits, na.encode = FALSE) :
corrupt data frame: columns will be truncated or padded with NAs
I think you misunderstand the use of the key functions (group_by and summarise) in dplyr.
First of all, the output of these key functions is a data frame, not a vector. So you should not assign the output to df.b$n, a new column in the data frame.
Secondly, if you want to create a new column, use mutate; summarise is for summarising group statistics, not for creating a new column.
Thirdly, you may want to review how the pipe operation works (http://seananderson.ca/2014/09/13/dplyr-intro.html). The first argument of these key functions is always a data frame. You should begin with df.b2 <- df.b %>% group_by(...) or df.b2 <- group_by(df.b, ...), where ... should be column names. In your original code, you use group_by(df.b$term) %>% summarise(df.b$term, freq = n()), which leads to the error. This makes sense because group_by should take a data frame as its first argument, but you provided a character vector.
One final note, you may not show your entire data frame, but it seems like the elements in the term column are all unique, so the frequency count based on that column is probably all 1. Make sure this is what you want.
I modified your code a little bit as follows. Hopefully, the output df.b2 makes sense.
library(dplyr)
df.b2 <- df.b %>%
group_by(term) %>%
mutate(freq = n()) %>%
ungroup()
df.b2
# # A tibble: 6 x 6
# term estimate std.error statistic p.value freq
# <chr> <dbl> <dbl> <dbl> <dbl> <int>
# 1 (Intercept) 6.0888310 1.3601938 4.4764437 8.318542e-06 1
# 2 factor(age76)25 0.6884056 0.8861507 0.7768494 4.374021e-01 1
# 3 factor(age76)26 0.2177806 0.9997128 0.2178431 8.275887e-01 1
# 4 factor(age76)27 0.5539639 0.9255542 0.5985213 5.496061e-01 1
# 5 factor(age76)28 0.8705031 0.5343690 1.6290300 1.035716e-01 1
# 6 factor(age76)29 1.2249185 0.7557118 1.6208804 1.053084e-01 1
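For comparison, the summarise route mentioned above gives one row per term rather than a new column; a small sketch (term_counts is just an illustrative name, and the join step is only needed if you still want the count attached to every row):
term_counts <- df.b %>%
  group_by(term) %>%
  summarise(freq = n())               # one row per unique term with its count
df.b %>%
  left_join(term_counts, by = "term") # optional: attach the count back as a column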
I want to find deciles for each grouped variable. I am specifically looking for methods using dplyr and lapply. I'd appreciate it if you can help me out.
Here's what I tried. I don't know how to pull deciles directly other than calling dplyr::ntile() (which didn't work for me).
Attempt 1
Here's what I tried using describe() from Hmisc package:
set.seed(10)
IData <- data.frame(let = sample( x = LETTERS, size = 10000, replace=TRUE), numbers = sample(x = c(1:20000),size = 10000))
Output<-IData %>% data.table::as.data.table(.) %>% split(.,by=c("let"),drop = TRUE,sorted = TRUE) %>% purrr::map(~describe(.$numbers))
This certainly helps but there are two problems with above code:
a) The output (even the list format) is not something I am looking for.
b) I don't really know how to extract 5%, 10%...from the list above.
The bottom line is that I am stuck.
Attempt 2
I tried replacing describe with ntile, but the following code gave me output which didn't make sense to me because the number of columns isn't 10. Upon running Output[[1]], I see a vector of ~400 numbers instead of 10.
Output<-IData %>% data.table::as.data.table(.) %>% split(.,by=c("let"),drop = TRUE,sorted = TRUE) %>% purrr::map(~dplyr::ntile(.$numbers,10))
Attempt 3 = Expected Output
Finally, I tried going old school (i.e. copy-paste) to get the expected output:
Output<-IData %>%
dplyr::group_by(let) %>%
dplyr::summarise( QQuantile1 = quantile(`numbers`, c(.10)),
QQuantile2 = quantile(`numbers`, c(.20)),
QQuantile3 = quantile(`numbers`, c(.30)),
QQuantile4 = quantile(`numbers`, c(.40)),
QQuantile5 = quantile(`numbers`, c(.50)),
QQuantile6 = quantile(`numbers`, c(.60)),
QQuantile7 = quantile(`numbers`, c(.70)),
QQuantile8 = quantile(`numbers`, c(.80)),
QQuantile9 = quantile(`numbers`, c(.90)),
QQuantile10 = quantile(`numbers`, c(1.00)))
Question: Can someone please help me generate the above output using these three approaches (not just one, but preferably all of them, for learning):
1) lapply
2) dplyr
3) data.table
I looked at several threads on SO, but they all talk about a specific quantile rather than all of them, e.g. the "Find top deciles from dataframe by group" thread.
To assemble my comments into an answer, base R is shockingly simple:
aggregate(numbers ~ let, IData, quantile, seq(0.1, 1, 0.1))
## let numbers.10% numbers.20% numbers.30% numbers.40% numbers.50% numbers.60% numbers.70% numbers.80% ...
## 1 A 1749.8 3847.8 5562.6 7475.2 9926.0 11758.6 13230.6 15788.8
## 2 B 2393.5 4483.6 6359.1 7708.0 9773.0 11842.8 13468.9 16266.4
## 3 C 2041.5 3682.0 5677.5 7504.0 9226.0 11470.0 13628.5 15379.0
## 4 D 1890.7 4086.8 5661.9 7526.6 9714.0 11438.8 13969.2 15967.2
## 5 E 2083.6 4107.0 6179.8 7910.8 10095.0 11692.6 13668.0 15570.2
## 6 F 1936.6 4220.2 6197.0 8791.8 10382.0 12266.4 14589.2 16407.0
## 7 G 3059.4 4884.2 6519.6 8530.0 10481.0 12469.0 14401.6 16127.8
## 8 H 2186.5 4081.0 5801.5 7206.0 9256.5 11453.0 13692.0 15471.0
## 9 I 1534.1 3793.2 5822.2 7621.4 9417.5 11737.0 14191.2 15722.4
## 10 J 1967.2 4286.6 5829.6 7664.6 10606.0 12217.4 14422.2 16628.0
## ...
with the caveat that numbers is actually a nested column that may need to be unpacked for further usage.
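A quick sketch of one way to unpack that nested column into ordinary columns (agg is just an illustrative name for the aggregate result):
agg <- aggregate(numbers ~ let, IData, quantile, seq(0.1, 1, 0.1))
flat <- do.call(data.frame, agg)  # expands the matrix column into ordinary columns, one per quantile
head(flat)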
dplyr works if you use list columns or do and reshape:
library(tidyverse)
IData %>% group_by(let) %>%
summarise(quant_prob = list(paste0('quant', seq(.1, 1, .1))),
quant_value = list(quantile(numbers, seq(.1, 1, .1)))) %>%
unnest() %>%
spread(quant_prob, quant_value)
## # A tibble: 26 × 11
## let quant0.1 quant0.2 quant0.3 quant0.4 quant0.5 quant0.6 quant0.7 quant0.8 quant0.9 quant1
## * <fctr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 A 1749.8 3847.8 5562.6 7475.2 9926.0 11758.6 13230.6 15788.8 17763.0 19958
## 2 B 2393.5 4483.6 6359.1 7708.0 9773.0 11842.8 13468.9 16266.4 17877.4 19929
## 3 C 2041.5 3682.0 5677.5 7504.0 9226.0 11470.0 13628.5 15379.0 17265.0 19876
## 4 D 1890.7 4086.8 5661.9 7526.6 9714.0 11438.8 13969.2 15967.2 17961.0 19989
## 5 E 2083.6 4107.0 6179.8 7910.8 10095.0 11692.6 13668.0 15570.2 18011.4 19887
## 6 F 1936.6 4220.2 6197.0 8791.8 10382.0 12266.4 14589.2 16407.0 18345.0 19997
## 7 G 3059.4 4884.2 6519.6 8530.0 10481.0 12469.0 14401.6 16127.8 18219.2 19922
## 8 H 2186.5 4081.0 5801.5 7206.0 9256.5 11453.0 13692.0 15471.0 17331.0 19996
## 9 I 1534.1 3793.2 5822.2 7621.4 9417.5 11737.0 14191.2 15722.4 17706.6 19965
## 10 J 1967.2 4286.6 5829.6 7664.6 10606.0 12217.4 14422.2 16628.0 18091.2 19901
## # ... with 16 more rows
Another interesting option is purrrlyr::by_slice, which lets you collect the results to columns:
IData %>% group_by(let) %>%
by_slice(~quantile(.x$numbers, seq(0.1, 1, 0.1)), .collate = "cols")
## # A tibble: 26 × 11
## let .out1 .out2 .out3 .out4 .out5 .out6 .out7 .out8 .out9 .out10
## <fctr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 A 1749.8 3847.8 5562.6 7475.2 9926.0 11758.6 13230.6 15788.8 17763.0 19958
## 2 B 2393.5 4483.6 6359.1 7708.0 9773.0 11842.8 13468.9 16266.4 17877.4 19929
## 3 C 2041.5 3682.0 5677.5 7504.0 9226.0 11470.0 13628.5 15379.0 17265.0 19876
## 4 D 1890.7 4086.8 5661.9 7526.6 9714.0 11438.8 13969.2 15967.2 17961.0 19989
## 5 E 2083.6 4107.0 6179.8 7910.8 10095.0 11692.6 13668.0 15570.2 18011.4 19887
## 6 F 1936.6 4220.2 6197.0 8791.8 10382.0 12266.4 14589.2 16407.0 18345.0 19997
## 7 G 3059.4 4884.2 6519.6 8530.0 10481.0 12469.0 14401.6 16127.8 18219.2 19922
## 8 H 2186.5 4081.0 5801.5 7206.0 9256.5 11453.0 13692.0 15471.0 17331.0 19996
## 9 I 1534.1 3793.2 5822.2 7621.4 9417.5 11737.0 14191.2 15722.4 17706.6 19965
## 10 J 1967.2 4286.6 5829.6 7664.6 10606.0 12217.4 14422.2 16628.0 18091.2 19901
## # ... with 16 more rows
though the column names are a little lousy.
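If you want nicer names, one sketch is to rename the .out columns afterwards (out is just an illustrative name for the by_slice result):
library(purrrlyr)
out <- IData %>% group_by(let) %>%
  by_slice(~quantile(.x$numbers, seq(0.1, 1, 0.1)), .collate = "cols")
names(out)[-1] <- paste0("quant", seq(0.1, 1, 0.1))  # .out1 ... .out10 -> quant0.1 ... quant1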
We can do this in a compact way with data.table. Convert the data.frame to a data.table (setDT(IData)), group by 'let', get the quantiles of 'numbers', and convert the result to a list (as.list):
library(data.table)
setDT(IData)[, as.list(quantile(numbers, seq(.1, 1, .1))), by = let]
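Since the question also asked for an lapply-based method, here is a minimal base R sketch on the same IData (dec_list and dec_df are just illustrative names):
dec_list <- lapply(split(IData$numbers, IData$let),   # one vector of numbers per letter
                   quantile, probs = seq(0.1, 1, 0.1))
dec_df <- data.frame(let = names(dec_list), do.call(rbind, dec_list))  # one row per letter
head(dec_df)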
I have a dataframe which looks like this
> head(data)
LH3003 LH3004 LH3005 LH3006 LH3007 LH3008 LH3009 LH3010 LH3011
cg18478105 0.02329879 0.08103364 0.01611778 0.01691191 0.01886975 0.01885553 0.01647439 0.02120779 0.01168622
cg14361672 0.09479536 0.07821380 0.02522833 0.06467310 0.05387729 0.05866673 0.08121820 0.10920162 0.04413263
cg01763666 0.03625680 0.04633759 0.04401555 0.08371531 0.09866403 0.17611284 0.07306743 0.12422579 0.11125146
cg02115394 0.10014794 0.09274320 0.08743445 0.08906313 0.09934032 0.18164115 0.06526380 0.08158144 0.08862067
cg13417420 0.01811630 0.02221060 0.01314041 0.01964530 0.02367295 0.01209913 0.01612864 0.01306061 0.04421938
cg26724186 0.32776266 0.31386294 0.24167480 0.29036142 0.24751268 0.26894756 0.20927278 0.28070790 0.33188921
LH3012 LH3013 LH3014
cg18478105 0.02466508 0.01909706 0.02054417
cg14361672 0.09172160 0.06170230 0.07752691
cg01763666 0.04328518 0.13693868 0.04288165
cg02115394 0.08682942 0.08601880 0.12413149
cg13417420 0.01980470 0.02241745 0.02038114
cg26724186 0.30832389 0.27644816 0.37630038
with almost 850000 rows,
and a different dataframe which contains the information behind the sample names:
> variables
Sample_ID Name Group01
3 LH3003 pair1 0
4 LH3004 pair1 1
5 LH3005 pair2 0
6 LH3006 pair2 1
7 LH3007 pair3 0
8 LH3008 pair3 1
9 LH3009 pair4 0
10 LH3010 pair4 1
11 LH3011 pair5 0
12 LH3012 pair5 1
13 LH3013 pair6 0
14 LH3014 pair6 1
Is it possible to do a paired t-test by defining the pairs and the group annotation of the samples based on another dataframe?
Thank you for your help!
Here is an lapply method that will store the results of each test in a list. This assumes that each pair occupies adjacent rows in the second data frame, df2, and that the first data frame is named df1.
myTestList <- lapply(seq(1, nrow(df2), 2), function(i)
t.test(df1[[df2$Sample_ID[i]]], df1[[df2$Sample_ID[i+1]]], paired=TRUE))
which returns
myTestList
[[1]]
Paired t-test
data: df1[[df2$Sample_ID[i]]] and df1[[df2$Sample_ID[i + 1]]]
t = -0.50507, df = 5, p-value = 0.635
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-0.03453201 0.02319070
sample estimates:
mean of the differences
-0.005670653
[[2]]
Paired t-test
data: df1[[df2$Sample_ID[i]]] and df1[[df2$Sample_ID[i + 1]]]
t = -2.5322, df = 5, p-value = 0.05239
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-0.0459320947 0.0003458114
sample estimates:
mean of the differences
-0.02279314
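To pull out just the p-values (one per pair) from that list, a small sketch:
sapply(myTestList, function(tt) tt$p.value)  # numeric vector of p-values, one per pair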
data
df1 <- read.table(header=TRUE, text="LH3003 LH3004 LH3005 LH3006 LH3007 LH3008 LH3009 LH3010 LH3011
cg18478105 0.02329879 0.08103364 0.01611778 0.01691191 0.01886975 0.01885553 0.01647439 0.02120779 0.01168622
cg14361672 0.09479536 0.07821380 0.02522833 0.06467310 0.05387729 0.05866673 0.08121820 0.10920162 0.04413263
cg01763666 0.03625680 0.04633759 0.04401555 0.08371531 0.09866403 0.17611284 0.07306743 0.12422579 0.11125146
cg02115394 0.10014794 0.09274320 0.08743445 0.08906313 0.09934032 0.18164115 0.06526380 0.08158144 0.08862067
cg13417420 0.01811630 0.02221060 0.01314041 0.01964530 0.02367295 0.01209913 0.01612864 0.01306061 0.04421938
cg26724186 0.32776266 0.31386294 0.24167480 0.29036142 0.24751268 0.26894756 0.20927278 0.28070790 0.33188921")[1:4]
df2 <- read.table(header=TRUE, text=" Sample_ID Name Group01
3 LH3003 pair1 0
4 LH3004 pair1 1
5 LH3005 pair2 0
6 LH3006 pair2 1")
You need to stack your data and define a pair column, then run the t.test; this is for 1 of the 6 tests:
data2 <- data.frame(x = c(data$LH3003, data$LH3004), pair = c(rep(0, nrow(data)), rep(1, nrow(data))))
t.test(x ~ pair, data2)
Here's a variation on @Imo's answer:
lapply(unique(df2$Name), function(x){
samples <- df2[df2$Name==x,1]
t.test(df1[,samples[1]], df1[,samples[2]], paired=T)
})