T.test between two data sets - row by row - r

I think that title explains everything. I would like to do t.test between two data sets. I would like to compare row by row.
Let's use mtcars for that and slightly modified mtcars_mod.
structure(list(mpg = c(21, 25, 22.8, 21.4, 18.7, 18.1, 14.3,
24.4, 24.8, 19.2, 17.8, 16.4, 17.3, 15.2, 10.4, 10.4, 14.7, 32.4,
36.4, 31.9, 21.5, 15.5, 15.2, 13.3, 19.2, 27.3, 26, 30.4, 15.8,
29.7, 15, 21.4), cyl = c(6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8,
8, 8, 8, 8, 7, 4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 8, 6, 8, 4),
disp = c(160, 160, 108, 258, 360, 225, 360, 146.7, 140.8,
167.6, 167.6, 275.8, 275.8, 275.8, 6, 460, 440, 78.7, 75.7,
71.1, 120.1, 318, 304, 350, 400, 79, 15, 97, 351, 145,
301, 121), hp = c(110, 110, 93, 110, 175, 105, 245, 62, 95,
123, 123, 180, 180, 180, 205, 215, 230, 66, 52, 65, 97, 150,
150, 245, 175, 66, 91, 113, 264, 175, 335, 109), drat = c(3.9,
3.9, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92,
3.07, 3.07, 3.07, 2.93, 3, 3.23, 4.08, 4.93, 4.22, 3.7, 2.76,
3.15, 3.73, 3.08, 4.08, 4.43, 3.77, 4.22, 3.62, 3.54, 4.11
), wt = c(2.62, 2.875, 2.32, 7, 3.44, 3.46, 3.57, 3.19,
3.15, 3.44, 3.44, 4.07, 3.73, 3.78, 5.25, 5.424, 5.345, 2.2,
1.615, 1.835, 2.465, 3.52, 3.435, 3.84, 3.845, 1.935, 2.14,
1.513, 3.17, 2.77, 6, 2.78), qsec = c(16.46, 17.02, 18.61,
114, 17.02, 20.22, 15.84, 12, 22.9, 18.3, 18.9, 17.4, 17.6,
18, 17.98, 17.82, 17.42, 19.47, 18.52, 19.9, 20.01, 16.87,
32, 15.41, 17.05, 18.9, 16.7, 16.9, 14.5, 15.5, 14.6, 18.6
), vs = c(0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0,
0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1), am = c(1,
1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1), gear = c(4, 4, 4, 3,
3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3,
3, 3, 4, 5, 5, 5, 5, 5, 4), carb = c(4, 4, 1, 1, 2, 1, 4,
2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, 1, 2, 2, 4, 2, 1,
2, 2, 4, 6, 8, 2)), .Names = c("mpg", "cyl", "disp", "hp",
"drat", "wt", "qsec", "vs", "am", "gear", "carb"), row.names = c("Mazda RX4",
"Mazda RX4 Wag", "Datsun 710", "Hornet 4 Drive", "Hornet Sportabout",
"Valiant", "Duster 360", "Merc 240D", "Merc 230", "Merc 280",
"Merc 280C", "Merc 450SE", "Merc 450SL", "Merc 450SLC", "Cadillac Fleetwood",
"Lincoln Continental", "Chrysler Imperial", "Fiat 128", "Honda Civic",
"Toyota Corolla", "Toyota Corona", "Dodge Challenger", "AMC Javelin",
"Camaro Z28", "Pontiac Firebird", "Fiat X1-9", "Porsche 914-2",
"Lotus Europa", "Ford Pantera L", "Ferrari Dino", "Maserati Bora",
"Volvo 142E"), class = "data.frame"
I tried to do it in the loop but I don't know how to store the results. I get only the last value...
for(z in 1:nrow(mtcars)){
vec_1 <- mtcars[z,1:7]
vec_2 <- mtcars_mod[z,1:7]
vec_results <- unlist(t.test(vec_1, vec_2)[3])
}
Can someone show me how to correct my loop ? I would prefer to use apply function but still would like to know what I did wrong with my loop....

(I'll just use my own modified mtcarsmod ... sorry, yours is missing at least one paren, and -- though I know exactly what happened -- it is ugly in that SO window!)
set.seed(42)
mtcarsmod <- as.data.frame(lapply(mtcars, jitter, factor = 5))
head(mtcarsmod)
# mpg cyl disp hp drat wt qsec vs am gear carb
# 1 21.1 5.55 160 109.7 3.89 2.62 16.5 -0.373 0.221 3.68 3.861
# 2 21.1 6.74 160 110.0 3.90 2.88 17.0 0.641 1.080 3.06 3.788
# 3 22.8 2.02 108 93.5 3.86 2.32 18.6 0.614 1.142 4.73 0.284
# 4 21.5 7.33 258 110.2 3.08 3.21 19.4 0.371 0.238 3.46 0.560
# 5 18.7 6.03 360 175.3 3.15 3.44 17.0 -0.903 0.430 2.63 2.130
# 6 18.1 4.83 225 104.4 2.77 3.46 20.2 0.491 -0.753 2.77 1.870
Instead of a loop you should probably use sapply or one of its kin.
# For every row index r, run t.test() on columns 1:7 of row r from each data
# frame and keep only the third element of the result (the p-value).
# sapply() simplifies the per-row results into one vector of p-values.
sapply(seq_len(nrow(mtcars)),
function(r) unlist(t.test(mtcars[r,1:7], mtcarsmod[r,1:7])[3]))
# p.value p.value p.value p.value p.value p.value p.value p.value p.value p.value p.value
# 0.998 0.998 0.992 0.996 0.998 0.995 0.999 1.000 0.999 0.998 0.995
# p.value p.value p.value p.value p.value p.value p.value p.value p.value p.value p.value
# 0.995 0.999 0.999 0.998 0.999 0.997 0.999 0.995 0.997 0.995 0.999
# p.value p.value p.value p.value p.value p.value p.value p.value p.value p.value
# 0.997 0.998 1.000 0.990 0.997 0.999 0.999 0.995 0.997 0.995
One advantage to using lapply might be using more of the test results. For instance:
# Same row-by-row tests, but keep each complete t.test() result in a list
# (one element per row) so any component can be pulled out afterwards.
ret <- lapply(seq_len(nrow(mtcars)),
function(r) t.test(mtcars[r,1:7], mtcarsmod[r,1:7]))
str(head(ret, n = 2))
# List of 2
# $ :List of 9
# ..$ statistic : Named num 0.0024
# .. ..- attr(*, "names")= chr "t"
# ..$ parameter : Named num 12
# .. ..- attr(*, "names")= chr "df"
# ..$ p.value : num 0.998
# ..$ conf.int : atomic [1:2] -73.4 73.5
# .. ..- attr(*, "conf.level")= num 0.95
# ..$ estimate : Named num [1:2] 45.7 45.6
# .. ..- attr(*, "names")= chr [1:2] "mean of x" "mean of y"
# ..$ null.value : Named num 0
# .. ..- attr(*, "names")= chr "difference in means"
# ..$ alternative: chr "two.sided"
# ..$ method : chr "Welch Two Sample t-test"
# ..$ data.name : chr "mtcars[r, 1:7] and mtcarsmod[r, 1:7]"
# ..- attr(*, "class")= chr "htest"
# $ :List of 9
# ..$ statistic : Named num -0.00311
# .. ..- attr(*, "names")= chr "t"
# ..$ parameter : Named num 12
# .. ..- attr(*, "names")= chr "df"
# ..$ p.value : num 0.998
# ..$ conf.int : atomic [1:2] -73.4 73.2
# .. ..- attr(*, "conf.level")= num 0.95
# ..$ estimate : Named num [1:2] 45.8 45.9
# .. ..- attr(*, "names")= chr [1:2] "mean of x" "mean of y"
# ..$ null.value : Named num 0
# .. ..- attr(*, "names")= chr "difference in means"
# ..$ alternative: chr "two.sided"
# ..$ method : chr "Welch Two Sample t-test"
# ..$ data.name : chr "mtcars[r, 1:7] and mtcarsmod[r, 1:7]"
# ..- attr(*, "class")= chr "htest"
ret[[1]]$p.value
# [1] 0.998
And you can still easily get a vector of p-values from the results:
sapply(ret, `[[`, "p.value")
# [1] 0.998 0.998 0.992 0.996 0.998 0.995 0.999 1.000 0.999 0.998 0.995 0.995 0.999 0.999
# [15] 0.998 0.999 0.997 0.999 0.995 0.997 0.995 0.999 0.997 0.998 1.000 0.990 0.997 0.999
# [29] 0.999 0.995 0.997 0.995

Related

Mutate across multiple columns based on condition (length of unique values)

I'm trying to create a function inside mutate() + across() that converts to a factor any variable that has five or fewer unique values (or any arbitrary number), with the idea of later using those factors to do some grouping. I think the logic of the function is correct, but I'm getting an "incorrect number of dimensions" error (the error message is in Spanish). For the sake of simplicity, I'm using the mtcars dataset.
mtcars %>%
mutate(across(1:ncol(.),
function(x) {
if_else(length(unique(x[,i]))<=5,
as.factor(x),
x)}
))
Error: Problem with `mutate()` input `..1`.
i `..1 = across(...)`.
x número incorreto de dimensiones
Run `rlang::last_error()` to see where the error occurred.
Any help or advice will be much appreciated.
Here we need if/else, because ifelse/if_else are vectorized and require the condition and both branches to have compatible lengths. The expression length(unique(x)) <= 5 returns a logical value of length 1, whereas as.factor(x) and x are whole columns, so the call breaks. Also, with dplyr, we can use select-helpers, i.e. everything(), to select all the columns
library(dplyr)
out <- mtcars %>%
mutate(across(everything(),
function(x) {
if(length(unique(x))<=5)
as.factor(x) else
x}
))
-output
> str(out)
'data.frame': 32 obs. of 11 variables:
$ mpg : num 21 21 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 ...
$ cyl : Factor w/ 3 levels "4","6","8": 2 2 1 2 3 2 3 1 1 2 ...
$ disp: num 160 160 108 258 360 ...
$ hp : num 110 110 93 110 175 105 245 62 95 123 ...
$ drat: num 3.9 3.9 3.85 3.08 3.15 2.76 3.21 3.69 3.92 3.92 ...
$ wt : num 2.62 2.88 2.32 3.21 3.44 ...
$ qsec: num 16.5 17 18.6 19.4 17 ...
$ vs : Factor w/ 2 levels "0","1": 1 1 2 2 1 2 1 2 2 2 ...
$ am : Factor w/ 2 levels "0","1": 2 2 2 1 1 1 1 1 1 1 ...
$ gear: Factor w/ 3 levels "3","4","5": 2 2 2 1 1 1 1 2 2 2 ...
$ carb: num 4 4 1 1 2 1 4 2 2 4 ...
In addition, the lambda function can be concise with ~ and make use of n_distinct
mtcars %>%
mutate(across(everything(),
~ if(n_distinct(.x) <=5) as.factor(.x) else .x))
Another way would be to use a predicate function in where inside across.
We can either define a custom function:
library(dplyr)
# Predicate for where(): TRUE when `x` holds at most five distinct values.
few_unique_vals <- function(x) {
  n_unique <- length(unique(x))
  n_unique <= 5
}
mtcars %>%
mutate(across(where(few_unique_vals), as.factor)) %>%
glimpse # for better printing
#> Rows: 32
#> Columns: 11
#> $ mpg <dbl> 21.0, 21.0, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19.2, 17.8,~
#> $ cyl <fct> 6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 8,~
#> $ disp <dbl> 160.0, 160.0, 108.0, 258.0, 360.0, 225.0, 360.0, 146.7, 140.8, 16~
#> $ hp <dbl> 110, 110, 93, 110, 175, 105, 245, 62, 95, 123, 123, 180, 180, 180~
#> $ drat <dbl> 3.90, 3.90, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92,~
#> $ wt <dbl> 2.620, 2.875, 2.320, 3.215, 3.440, 3.460, 3.570, 3.190, 3.150, 3.~
#> $ qsec <dbl> 16.46, 17.02, 18.61, 19.44, 17.02, 20.22, 15.84, 20.00, 22.90, 18~
#> $ vs <fct> 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,~
#> $ am <fct> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,~
#> $ gear <fct> 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3,~
#> $ carb <dbl> 4, 4, 1, 1, 2, 1, 4, 2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, 1, 2,~
Or we can use an anonymous purrr-style function:
mtcars %>%
mutate(across(where(~ length(unique(.x))<=5),
as.factor)) %>%
glimpse # for better printing
#> Rows: 32
#> Columns: 11
#> $ mpg <dbl> 21.0, 21.0, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19.2, 17.8,~
#> $ cyl <fct> 6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 8,~
#> $ disp <dbl> 160.0, 160.0, 108.0, 258.0, 360.0, 225.0, 360.0, 146.7, 140.8, 16~
#> $ hp <dbl> 110, 110, 93, 110, 175, 105, 245, 62, 95, 123, 123, 180, 180, 180~
#> $ drat <dbl> 3.90, 3.90, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92,~
#> $ wt <dbl> 2.620, 2.875, 2.320, 3.215, 3.440, 3.460, 3.570, 3.190, 3.150, 3.~
#> $ qsec <dbl> 16.46, 17.02, 18.61, 19.44, 17.02, 20.22, 15.84, 20.00, 22.90, 18~
#> $ vs <fct> 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,~
#> $ am <fct> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0,~
#> $ gear <fct> 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3,~
#> $ carb <dbl> 4, 4, 1, 1, 2, 1, 4, 2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, 1, 2,~
Created on 2022-03-15 by the reprex package (v2.0.1)

How I can view rows and columns of 'Adult' Dataset in R [closed]

Closed. This question needs details or clarity. It is not currently accepting answers.
Want to improve this question? Add details and clarify the problem by editing this post.
Closed 4 years ago.
Improve this question
How I can view rows and columns of 'Adult' Dataset in R? I just started learning R.
Any help is appreciated. Please refer to the screenshot: [enter image description here]
First, start by running str to see the structure of your dataset.
str(Adult)
#Formal class 'transactions' [package "arules"] with 3 slots
# ..@ data :Formal class 'ngCMatrix' [package "Matrix"] with 5 slots
# .. .. ..@ i : int [1:612200] 1 10 25 32 35 50 59 61 63 65 ...
# .. .. ..@ p : int [1:48843] 0 13 26 39 52 65 78 91 104 117 ...
# .. .. ..@ Dim : int [1:2] 115 48842
# .. .. ..@ Dimnames:List of 2
# .. .. .. ..$ : NULL
# .. .. .. ..$ : NULL
# .. .. ..@ factors : list()
# ..@ itemInfo :'data.frame': 115 obs. of 3 variables:
# .. ..$ labels : chr [1:115] "age=Young" "age=Middle-aged" "age=Senior" "age=Old" ...
# .. ..$ variables: Factor w/ 13 levels "age","capital-gain",..: 1 1 1 1 13 13 13 13 13 13 ...
# .. ..$ levels : Factor w/ 112 levels "10th","11th",..: 111 63 92 69 30 54 65 82 90 91 ...
# ..@ itemsetInfo:'data.frame': 48842 obs. of 1 variable:
# .. ..$ transactionID: chr [1:48842] "1" "2" "3" "4" ...
This tells you that Adult is an S4 object with three slots, data, itemInfo and itemsetInfo.
The slot data is a sparse matrix created by package Matrix;
The slot itemInfo is a data.frame;
The slot itemsetInfo is also a data.frame.
S4 objects' slots are accessed with the operator @. In order to see what is in those slots, run
# `@` is the S4 slot-access operator; `#` would start a comment and the
# slot name would never be evaluated.
Adult@data
Adult@itemInfo
Adult@itemsetInfo
In the case of the two dataframes, you might prefer to run
# Preview only the first rows of the two data-frame slots (accessed with `@`).
head(Adult@itemInfo)
head(Adult@itemsetInfo)
since they have 115 and 48842 observations, respectively and don't fit in a screen display.
To get the text output shown in your example you can use this:
cat(dim(mtcars)[1], "transactions (rows)\n", dim(mtcars)[2], "items (cols)")
#32 transactions (rows)
# 11 items (cols)
Change mtcars with Adult(or any data.frame). cat lets you print out to the console, and dim() gets you rows and columns of the data.
Similarly to str from base R, you can use glimpse from dplyr package:
install.packages("dplyr") # run this the first time to install the package
dplyr::glimpse(mtcars)
# Observations: 32
# Variables: 11
# $ mpg <dbl> 21.0, 21.0, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19.2, 17.8, 16.4, 17.3, 15.2, 10.4, 10.4, 14.7, 32...
# $ cyl <dbl> 6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 8, 6, 8, 4
# $ disp <dbl> 160.0, 160.0, 108.0, 258.0, 360.0, 225.0, 360.0, 146.7, 140.8, 167.6, 167.6, 275.8, 275.8, 275.8, 472.0,...
# $ hp <dbl> 110, 110, 93, 110, 175, 105, 245, 62, 95, 123, 123, 180, 180, 180, 205, 215, 230, 66, 52, 65, 97, 150, 1...
# $ drat <dbl> 3.90, 3.90, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92, 3.07, 3.07, 3.07, 2.93, 3.00, 3.23, 4....
# $ wt <dbl> 2.620, 2.875, 2.320, 3.215, 3.440, 3.460, 3.570, 3.190, 3.150, 3.440, 3.440, 4.070, 3.730, 3.780, 5.250,...
# $ qsec <dbl> 16.46, 17.02, 18.61, 19.44, 17.02, 20.22, 15.84, 20.00, 22.90, 18.30, 18.90, 17.40, 17.60, 18.00, 17.98,...
# $ vs <dbl> 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1
# $ am <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1
# $ gear <dbl> 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3, 3, 3, 4, 5, 5, 5, 5, 5, 4
# $ carb <dbl> 4, 4, 1, 1, 2, 1, 4, 2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, 1, 2, 2, 4, 2, 1, 2, 2, 4, 6, 8, 2
You get the number of Observations (rows) and Variables (columns), and each variable is listed with its type and values.

Matrix Transformation in R - from aggregate output to outer-like matrix

I need to transform the output of an aggregate (mean) into a matrix outer-like style.
data(mtcars)
aggregate(disp ~ cyl + gear, data = mtcars, FUN = mean )
cyl gear disp
4 3 120.1000
6 3 241.5000
8 3 357.6167
4 4 102.6250
6 4 163.8000
4 5 107.7000
6 5 145.0000
8 5 326.0000
What I need is to put the means of disp into a matrix with gear in columns and cyl in rows
Like this
3 4 5
4 120 102 107
6 241 163 145
8 357 NA 326
Do you have any suggestion how I could do this transformation ?
Is there a way to use the function
outer
?
structure(list(mpg = c(21, 21, 22.8, 21.4, 18.7, 18.1, 14.3,
24.4, 22.8, 19.2, 17.8, 16.4, 17.3, 15.2, 10.4, 10.4, 14.7, 32.4,
30.4, 33.9, 21.5, 15.5, 15.2, 13.3, 19.2, 27.3, 26, 30.4, 15.8,
19.7, 15, 21.4), cyl = c(6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8,
8, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 8, 6, 8, 4),
disp = c(160, 160, 108, 258, 360, 225, 360, 146.7, 140.8,
167.6, 167.6, 275.8, 275.8, 275.8, 472, 460, 440, 78.7, 75.7,
71.1, 120.1, 318, 304, 350, 400, 79, 120.3, 95.1, 351, 145,
301, 121), hp = c(110, 110, 93, 110, 175, 105, 245, 62, 95,
123, 123, 180, 180, 180, 205, 215, 230, 66, 52, 65, 97, 150,
150, 245, 175, 66, 91, 113, 264, 175, 335, 109), drat = c(3.9,
3.9, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92,
3.07, 3.07, 3.07, 2.93, 3, 3.23, 4.08, 4.93, 4.22, 3.7, 2.76,
3.15, 3.73, 3.08, 4.08, 4.43, 3.77, 4.22, 3.62, 3.54, 4.11
), wt = c(2.62, 2.875, 2.32, 3.215, 3.44, 3.46, 3.57, 3.19,
3.15, 3.44, 3.44, 4.07, 3.73, 3.78, 5.25, 5.424, 5.345, 2.2,
1.615, 1.835, 2.465, 3.52, 3.435, 3.84, 3.845, 1.935, 2.14,
1.513, 3.17, 2.77, 3.57, 2.78), qsec = c(16.46, 17.02, 18.61,
19.44, 17.02, 20.22, 15.84, 20, 22.9, 18.3, 18.9, 17.4, 17.6,
18, 17.98, 17.82, 17.42, 19.47, 18.52, 19.9, 20.01, 16.87,
17.3, 15.41, 17.05, 18.9, 16.7, 16.9, 14.5, 15.5, 14.6, 18.6
), vs = c(0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1), am = c(1,
1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1), gear = c(4, 4, 4, 3,
3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3,
3, 3, 4, 5, 5, 5, 5, 5, 4), carb = c(4, 4, 1, 1, 2, 1, 4,
2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, 1, 2, 2, 4, 2, 1,
2, 2, 4, 6, 8, 2)), .Names = c("mpg", "cyl", "disp", "hp",
"drat", "wt", "qsec", "vs", "am", "gear", "carb"), row.names = c("Mazda RX4",
"Mazda RX4 Wag", "Datsun 710", "Hornet 4 Drive", "Hornet Sportabout",
"Valiant", "Duster 360", "Merc 240D", "Merc 230", "Merc 280",
"Merc 280C", "Merc 450SE", "Merc 450SL", "Merc 450SLC", "Cadillac Fleetwood",
"Lincoln Continental", "Chrysler Imperial", "Fiat 128", "Honda Civic",
"Toyota Corolla", "Toyota Corona", "Dodge Challenger", "AMC Javelin",
"Camaro Z28", "Pontiac Firebird", "Fiat X1-9", "Porsche 914-2",
"Lotus Europa", "Ford Pantera L", "Ferrari Dino", "Maserati Bora",
"Volvo 142E"), class = "data.frame")
You can try tapply
# Mean of disp for every cyl (rows) x gear (columns) combination; a
# combination that has no cars comes out as NA.
with(mtcars, tapply(disp, list(cyl, gear), FUN=mean))
# 3 4 5
#4 120.1000 102.625 107.7
#6 241.5000 163.800 145.0
#8 357.6167 NA 326.0
If you are looking to reshape the output of aggregate, we can use acast from reshape2
library(reshape2) # provides acast()
# Aggregate to long form first, then cast to a matrix with cyl in rows and
# gear in columns, filled with the mean disp values.
d1 <- aggregate(disp ~ cyl + gear, data = mtcars, FUN = mean )
acast(d1, cyl~gear, value.var='disp')

The most effective way to merge/combine two data sets by overlapping row.names and mean values

I would like to find the most effective way to combine two data frames that have partly different row.names and to average the values in their columns. That is, I would like to take just the row.names that occur in both data sets and combine those rows into one data frame. The values in each column should be averaged (mean). The example data:
mtcars <-
structure(list(mpg = c(21, 21, 22.8, 21.4, 18.7, 18.1, 14.3,
24.4, 22.8, 19.2, 17.8, 16.4, 17.3, 15.2, 10.4, 10.4, 14.7, 32.4,
30.4, 33.9, 21.5, 15.5, 15.2, 13.3, 19.2, 27.3, 26, 30.4, 15.8,
19.7, 15, 21.4), cyl = c(6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8,
8, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 8, 6, 8, 4),
disp = c(160, 160, 108, 258, 360, 225, 360, 146.7, 140.8,
167.6, 167.6, 275.8, 275.8, 275.8, 472, 460, 440, 78.7, 75.7,
71.1, 120.1, 318, 304, 350, 400, 79, 120.3, 95.1, 351, 145,
301, 121), hp = c(110, 110, 93, 110, 175, 105, 245, 62, 95,
123, 123, 180, 180, 180, 205, 215, 230, 66, 52, 65, 97, 150,
150, 245, 175, 66, 91, 113, 264, 175, 335, 109), drat = c(3.9,
3.9, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92,
3.07, 3.07, 3.07, 2.93, 3, 3.23, 4.08, 4.93, 4.22, 3.7, 2.76,
3.15, 3.73, 3.08, 4.08, 4.43, 3.77, 4.22, 3.62, 3.54, 4.11
), wt = c(2.62, 2.875, 2.32, 3.215, 3.44, 3.46, 3.57, 3.19,
3.15, 3.44, 3.44, 4.07, 3.73, 3.78, 5.25, 5.424, 5.345, 2.2,
1.615, 1.835, 2.465, 3.52, 3.435, 3.84, 3.845, 1.935, 2.14,
1.513, 3.17, 2.77, 3.57, 2.78), qsec = c(16.46, 17.02, 18.61,
19.44, 17.02, 20.22, 15.84, 20, 22.9, 18.3, 18.9, 17.4, 17.6,
18, 17.98, 17.82, 17.42, 19.47, 18.52, 19.9, 20.01, 16.87,
17.3, 15.41, 17.05, 18.9, 16.7, 16.9, 14.5, 15.5, 14.6, 18.6
), vs = c(0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1), am = c(1,
1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1), gear = c(4, 4, 4, 3,
3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3,
3, 3, 4, 5, 5, 5, 5, 5, 4), carb = c(4, 4, 1, 1, 2, 1, 4,
2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, 1, 2, 2, 4, 2, 1,
2, 2, 4, 6, 8, 2)), .Names = c("mpg", "cyl", "disp", "hp",
"drat", "wt", "qsec", "vs", "am", "gear", "carb"), row.names = c("Mazda RX4",
"Mazda RX4 Wag", "Datsun 710", "Hornet 4 Drive", "Hornet Sportabout",
"Valiant", "Duster 360", "Merc 240D", "Merc 230", "Merc 280",
"Merc 280C", "Merc 450SE", "Merc 450SL", "Merc 450SLC", "Cadillac Fleetwood",
"Lincoln Continental", "Chrysler Imperial", "Fiat 128", "Honda Civic",
"Toyota Corolla", "Toyota Corona", "Dodge Challenger", "AMC Javelin",
"Camaro Z28", "Pontiac Firebird", "Fiat X1-9", "Porsche 914-2",
"Lotus Europa", "Ford Pantera L", "Ferrari Dino", "Maserati Bora",
"Volvo 142E"), class = "data.frame")
Second data:
mtcars11 <-
structure(list(mpg = c(21, 21, 22.8, 21.4, 18.7, 18.1, 14.3,
24.4, 22.8, 19.2, 17.8, 16.4, 17.3, 15.2, 10.4, 10.4, 14.7, 32.4,
30.4, 33.9, 21.5, 15.5, 15.2, 13.3, 19.2, 27.3, 26, 30.4, 15.8,
19.7), cyl = c(6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 8, 8, 8, 8,
8, 4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 8, 6), disp = c(160, 160,
108, 258, 360, 225, 360, 146.7, 140.8, 167.6, 167.6, 275.8, 275.8,
275.8, 472, 460, 440, 78.7, 75.7, 71.1, 120.1, 318, 304, 350,
400, 79, 120.3, 95.1, 351, 145), hp = c(110, 110, 93, 110, 175,
105, 245, 62, 95, 123, 123, 180, 180, 180, 205, 215, 230, 66,
52, 65, 97, 150, 150, 245, 175, 66, 91, 113, 264, 175), drat = c(3.9,
3.9, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92, 3.07,
3.07, 3.07, 2.93, 3, 3.23, 4.08, 4.93, 4.22, 3.7, 2.76, 3.15,
3.73, 3.08, 4.08, 4.43, 3.77, 4.22, 3.62), wt = c(2.62, 2.875,
2.32, 3.215, 3.44, 3.46, 3.57, 3.19, 3.15, 3.44, 3.44, 4.07,
3.73, 3.78, 5.25, 5.424, 5.345, 2.2, 1.615, 1.835, 2.465, 3.52,
3.435, 3.84, 3.845, 1.935, 2.14, 1.513, 3.17, 2.77), qsec = c(16.46,
17.02, 18.61, 19.44, 17.02, 20.22, 15.84, 20, 22.9, 18.3, 18.9,
17.4, 17.6, 18, 17.98, 17.82, 17.42, 19.47, 18.52, 19.9, 20.01,
16.87, 17.3, 15.41, 17.05, 18.9, 16.7, 16.9, 14.5, 15.5), vs = c(0,
0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
0, 0, 0, 1, 0, 1, 0, 0), am = c(1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1),
gear = c(4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3,
3, 4, 4, 4, 3, 3, 3, 3, 3, 4, 5, 5, 5, 5), carb = c(4, 4,
1, 1, 2, 1, 4, 2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, 1,
2, 2, 4, 2, 1, 2, 2, 4, 6)), .Names = c("mpg", "cyl", "disp",
"hp", "drat", "wt", "qsec", "vs", "am", "gear", "carb"), row.names = c("Mazda RX4",
"Chrysler", "Datsun 710", "Hornet 4 Drive", "Hornet Sportabout",
"Valiant", "Duster 360", "Merc 240D", "Merc 230", "Merc 280",
"Merc 280C", "Merc 450SE", "Nexia", "Merc 450SLC", "Cadillac Fleetwood",
"Lincoln Continental", "Chrysler Imperial", "Fiat 128", "Honda Civic",
"Toyota Corolla", "Toyota Corona", "Dodge Challenger", "AMC Javelin",
"Camaro Z28", "Pontiac Firebirda", "Punto", "Porsche 914-2",
"Lotus Europa", "Ford Pantera T", "Ferrari Dino"), class = "data.frame")
So the solution which came to my mind is that (the long one):
vec_names_mt <- row.names(mtcars) ## so we the row.names from first data
vec_names_mt11 <- row.names(mtcars11) ## so we the row.names from second data
vec_inter <- intersect(vec_names_mt, vec_names_mt11) ## find overlapping names
data_mt <- mtcars[row.names(mtcars) %in% vec_inter, ] ## take the rows from first data which overlaps
data_mt11 <- mtcars11[row.names(mtcars11) %in% vec_inter, ] ## take the rows from second data which overlaps
How can we combine them and average the values ? Any idea how to do that in the simplest way ?
Assuming d1 and d2 are your data.frames, here's how I'd approach it. You'll have to use development version of data.table (v1.9.5) though, for mget to work.
library(data.table) # v1.9.5; library() errors if the package is missing
# Convert both data.frames to data.tables by reference, moving the row names
# into a column named `rn`, and key on that column for the join below.
setkey(setDT(d1, keep.rownames=TRUE), rn)
setkey(setDT(d2, keep.rownames=TRUE), rn)
# Value columns of d1 (everything except `rn`) and the "i."-prefixed names
# used to refer to the same columns of d2 inside the join.
xcols <- names(d1)[-1L]
icols <- paste0("i.", xcols)
# Mean of the paired values from both tables, ignoring NAs.
foo <- function(a, b) mean(c(a, b), na.rm=TRUE)
# Inner join on the key (nomatch=0L); j is evaluated once per row of d2.
d1[d2, Map(foo, mget(xcols), mget(icols)), by=.EACHI, nomatch=0L]
We first convert the data.frames to data.tables by reference using setDT, and convert row names to a new column (which will automatically be named rn), and set key on that column.
setkey() reorders a data.table by the columns specified, and marks those columns as key columns, which will help us perform a join (on those key columns).
In data.tables, joins can be accomplished by using the x[i] notation as well as merge() function (there's a data.table method implemented), but x[i] is much more powerful and flexible. The syntax x[i] joins each row of i to matching rows in x (on the key columns).
So, d1[d2] would return for each row in d2 the matching rows in d1, along with all the other columns in d2.
d1[d2, nomatch=0L] is the equivalent of an inner join, where only rows that matches are returned.
d1[d2, Map(foo, mget(xcols), mget(icols)), by=.EACHI, nomatch=0L] evaluates the expression in j = Map(...), for each row in d2 - hence by = .EACHI.
To sum things up, for each row in d2, find the matching rows in d1. Extract the columns specified in xcols and icols just for those matching rows, and apply the function foo(), which will concatenate the vectors and take their mean(). And do this for each row of d2 (by = .EACHI). Ignore rows in d2 that don't have any matches in d1 on the key column (nomatch=0L).
Hope this helps.
It seems like you are looking for an "inner join" between two data sets by the row names. I would suggest to try data.table package for both merging and the later melting and dcasting operation.
First, I will rename mtcars to mtcars2 because mtcars is a stored data set and I don't want both to override it and because setDT actually can't override stored data sets, so lets say that in real life your data is called mtcars2
library(data.table)
mtcars2 <- copy(mtcars)
Next, we will convert to data.table objects, while keeping row names, and setting a key for a faster join
setkey(setDT(mtcars2, keep.rownames = TRUE), rn)
setkey(setDT(mtcars11, keep.rownames = TRUE), rn)
Now we will perform an inner join over rn (the key) while keeping original column names using suffixes = NULL
Res <- merge(mtcars2, mtcars11, suffixes = NULL)
Now we can melt by rn and then dcast by unique columns while computing the mean
dcast(melt(Res, "rn"), rn ~ variable, mean.default)
# rn mpg cyl disp hp drat wt qsec vs am gear carb
# 1: AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
# 2: Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
# 3: Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
# 4: Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
# 5: Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
# 6: Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
# 7: Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
# 8: Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
# 9: Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
# 10: Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
# 11: Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
# 12: Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
# 13: Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
# 14: Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
# 15: Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
# 16: Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
# 17: Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
# 18: Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
# 19: Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
# 20: Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
# 21: Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
# 22: Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
# 23: Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
# 24: Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
# 25: Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1

Splitting row names by delimiter into another column in an data frame

This data frame
df <- structure(list(mpg = c(15.2, 10.4, 13.3, 14.7, 22.8, 15.5, 14.3,
19.7, 32.4, 27.3, 15.8, 30.4, 21.4, 18.7, 10.4, 30.4, 15, 21,
21, 22.8, 24.4, 19.2, 17.8, 16.4, 17.3, 15.2, 19.2, 26, 33.9,
21.5, 18.1, 21.4), cyl = c(8, 8, 8, 8, 4, 8, 8, 6, 4, 4, 8, 4,
6, 8, 8, 4, 8, 6, 6, 4, 4, 6, 6, 8, 8, 8, 8, 4, 4, 4, 6, 4),
disp = c(304, 472, 350, 440, 108, 318, 360, 145, 78.7, 79,
351, 75.7, 258, 360, 460, 95.1, 301, 160, 160, 140.8, 146.7,
167.6, 167.6, 275.8, 275.8, 275.8, 400, 120.3, 71.1, 120.1,
225, 121), hp = c(150, 205, 245, 230, 93, 150, 245, 175,
66, 66, 264, 52, 110, 175, 215, 113, 335, 110, 110, 95, 62,
123, 123, 180, 180, 180, 175, 91, 65, 97, 105, 109), drat = c(3.15,
2.93, 3.73, 3.23, 3.85, 2.76, 3.21, 3.62, 4.08, 4.08, 4.22,
4.93, 3.08, 3.15, 3, 3.77, 3.54, 3.9, 3.9, 3.92, 3.69, 3.92,
3.92, 3.07, 3.07, 3.07, 3.08, 4.43, 4.22, 3.7, 2.76, 4.11
), wt = c(3.435, 5.25, 3.84, 5.345, 2.32, 3.52, 3.57, 2.77,
2.2, 1.935, 3.17, 1.615, 3.215, 3.44, 5.424, 1.513, 3.57,
2.62, 2.875, 3.15, 3.19, 3.44, 3.44, 4.07, 3.73, 3.78, 3.845,
2.14, 1.835, 2.465, 3.46, 2.78), qsec = c(17.3, 17.98, 15.41,
17.42, 18.61, 16.87, 15.84, 15.5, 19.47, 18.9, 14.5, 18.52,
19.44, 17.02, 17.82, 16.9, 14.6, 16.46, 17.02, 22.9, 20,
18.3, 18.9, 17.4, 17.6, 18, 17.05, 16.7, 19.9, 20.01, 20.22,
18.6), vs = c(0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,
1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1), am = c(0,
0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0,
0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1), gear = c(3, 3, 3, 3,
4, 3, 3, 5, 4, 4, 5, 4, 3, 3, 3, 5, 5, 4, 4, 4, 4, 4, 4,
3, 3, 3, 3, 5, 4, 3, 3, 4), carb = c(2, 4, 4, 4, 1, 2, 4,
6, 1, 1, 4, 2, 1, 2, 4, 2, 8, 4, 4, 2, 2, 4, 4, 3, 3, 3,
2, 2, 1, 1, 1, 2)), .Names = c("mpg", "cyl", "disp", "hp",
"drat", "wt", "qsec", "vs", "am", "gear", "carb"), row.names = c("AMC Javelin:2.1.2.2.2",
"Cadillac Fleetwood:1.2.1.2.1", "Camaro Z28:1.2.2.1.2.2", "Chrysler Imperial:1.2.1.1",
"Datsun 710:2.2.2.2.1.2.2.2.1", "Dodge Challenger:2.1.2.2.1",
"Duster 360:1.2.2.1.2.1", "Ferrari Dino:2.2.2.1", "Fiat 128:2.2.1.2.2.1",
"Fiat X1-9:2.2.1.2.2.2", "Ford Pantera L:1.2.2.1.1", "Honda Civic:2.2.1.1",
"Hornet 4 Drive:2.1.1.1", "Hornet Sportabout:1.2.2.2.1", "Lincoln Continental:1.2.1.2.2",
"Lotus Europa:2.2.2.2.1.1", "Maserati Bora:1.1", "Mazda RX4:2.2.2.2.2.2.1.1",
"Mazda RX4 Wag:2.2.2.2.2.2.1.2", "Merc 230:2.2.2.2.1.2.1", "Merc 240D:2.2.2.2.2.1",
"Merc 280:2.2.2.2.2.2.2.1", "Merc 280C:2.2.2.2.2.2.2.2", "Merc 450SE:2.1.2.1.2.1",
"Merc 450SL:2.1.2.1.2.2", "Merc 450SLC:2.1.2.1.1", "Pontiac Firebird:1.2.2.2.2",
"Porsche 914-2:2.2.2.2.1.2.2.2.2.2", "Toyota Corolla:2.2.1.2.1",
"Toyota Corona:2.2.2.2.1.2.2.2.2.1", "Valiant:2.1.1.2", "Volvo 142E:2.2.2.2.1.2.2.1"
), class = "data.frame")
produces this:
> head(df)
mpg cyl disp hp drat wt qsec vs am gear carb
AMC Javelin:2.1.2.2.2 15.2 8 304 150 3.15 3.435 17.30 0 0 3 2
Cadillac Fleetwood:1.2.1.2.1 10.4 8 472 205 2.93 5.250 17.98 0 0 3 4
Camaro Z28:1.2.2.1.2.2 13.3 8 350 245 3.73 3.840 15.41 0 0 3 4
Chrysler Imperial:1.2.1.1 14.7 8 440 230 3.23 5.345 17.42 0 0 3 4
Datsun 710:2.2.2.2.1.2.2.2.1 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
Dodge Challenger:2.1.2.2.1 15.5 8 318 150 2.76 3.520 16.87 0 0 3 2
Note that the row names is delimited with ":". What I want to do is to split them
and the 2nd part becomes a new column of the data frame:
ancest mpg cyl disp hp drat wt qsec vs am gear carb
AMC Javelin 2.1.2.2.2 15.2 8 304 150 3.15 3.435 17.30 0 0 3 2
Cadillac Fleetwood 1.2.1.2.1 10.4 8 472 205 2.93 5.250 17.98 0 0 3 4
What's the way to do it?
I'm stuck with this:
rn <- rownames(df)
unlist(lapply(rn,strsplit,":"))
We can use strsplit to get the output in a "list", rbind the output to get a matrix "m1". Change the rownames of "df" by the first column and create a new column "ancest" with the second column of "m1"
m1 <- do.call(rbind, strsplit(rn, ':'))
row.names(df) <- m1[,1]
df['ancest'] <- m1[,2]
Or if you need the first column of the dataset as one of the split columns,
df1 <- cbind(ancest=m1[,2], df)
row.names(df1) <- m1[,1]
Or using splitstackshape and data.table
library(data.table)
library(splitstackshape)
# setDT(..., keep.rownames=TRUE) moves the row names into column `rn`.
# NOTE(review): cSplit() is expected to drop `rn` and append the two pieces as
# the last columns (12 = name, 13 = ancestry code) -- confirm with names().
# Selecting c(13, 1:12) puts the ancestry code first and leaves the name in
# column 13, which the next two lines turn into row names and then drop.
# (Selecting only c(12) would leave a single column, so df1[,13] would fail.)
df1 <- setDF(cSplit(setDT(df, keep.rownames=TRUE)[],
'rn', sep=":")[, c(13, 1:12), with=FALSE])
rownames(df1) <- df1[,13]
df1 <- df1[-13]
Try this:
# create a new variable with the row names
df$names <- rownames(df)
# split the new variable into two pieces, delete the pattern (the :), and keep both pieces
# (str_split_fixed() returns a two-column character matrix, so df$names
# becomes a matrix column holding the name and the ancestry code)
df$names <- stringr::str_split_fixed(df$names, ":", 2)
Using the sapply and [ functions:
nm_plus_ancest <- rownames(df)
nm_plus_ancest_split <- strsplit(nm_plus_ancest, ":")
rownames(df) <- sapply(nm_plus_ancest_split, "[", 1)
df$ancest <- sapply(nm_plus_ancest_split, "[", 2)
And you can rearrange the columns with the nice dplyr::select function:
library(dplyr) # library() errors if dplyr is missing; require() only warns
# Put `ancest` first, followed by the columns mpg through carb.
df <- select(df, ancest, mpg:carb)

Resources