Splitting row names by delimiter into another column in a data frame

Splitting row names by delimiter into another column in a data frame - r

This data frame
df <- structure(list(mpg = c(15.2, 10.4, 13.3, 14.7, 22.8, 15.5, 14.3,
19.7, 32.4, 27.3, 15.8, 30.4, 21.4, 18.7, 10.4, 30.4, 15, 21,
21, 22.8, 24.4, 19.2, 17.8, 16.4, 17.3, 15.2, 19.2, 26, 33.9,
21.5, 18.1, 21.4), cyl = c(8, 8, 8, 8, 4, 8, 8, 6, 4, 4, 8, 4,
6, 8, 8, 4, 8, 6, 6, 4, 4, 6, 6, 8, 8, 8, 8, 4, 4, 4, 6, 4),
disp = c(304, 472, 350, 440, 108, 318, 360, 145, 78.7, 79,
351, 75.7, 258, 360, 460, 95.1, 301, 160, 160, 140.8, 146.7,
167.6, 167.6, 275.8, 275.8, 275.8, 400, 120.3, 71.1, 120.1,
225, 121), hp = c(150, 205, 245, 230, 93, 150, 245, 175,
66, 66, 264, 52, 110, 175, 215, 113, 335, 110, 110, 95, 62,
123, 123, 180, 180, 180, 175, 91, 65, 97, 105, 109), drat = c(3.15,
2.93, 3.73, 3.23, 3.85, 2.76, 3.21, 3.62, 4.08, 4.08, 4.22,
4.93, 3.08, 3.15, 3, 3.77, 3.54, 3.9, 3.9, 3.92, 3.69, 3.92,
3.92, 3.07, 3.07, 3.07, 3.08, 4.43, 4.22, 3.7, 2.76, 4.11
), wt = c(3.435, 5.25, 3.84, 5.345, 2.32, 3.52, 3.57, 2.77,
2.2, 1.935, 3.17, 1.615, 3.215, 3.44, 5.424, 1.513, 3.57,
2.62, 2.875, 3.15, 3.19, 3.44, 3.44, 4.07, 3.73, 3.78, 3.845,
2.14, 1.835, 2.465, 3.46, 2.78), qsec = c(17.3, 17.98, 15.41,
17.42, 18.61, 16.87, 15.84, 15.5, 19.47, 18.9, 14.5, 18.52,
19.44, 17.02, 17.82, 16.9, 14.6, 16.46, 17.02, 22.9, 20,
18.3, 18.9, 17.4, 17.6, 18, 17.05, 16.7, 19.9, 20.01, 20.22,
18.6), vs = c(0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,
1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1), am = c(0,
0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0,
0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1), gear = c(3, 3, 3, 3,
4, 3, 3, 5, 4, 4, 5, 4, 3, 3, 3, 5, 5, 4, 4, 4, 4, 4, 4,
3, 3, 3, 3, 5, 4, 3, 3, 4), carb = c(2, 4, 4, 4, 1, 2, 4,
6, 1, 1, 4, 2, 1, 2, 4, 2, 8, 4, 4, 2, 2, 4, 4, 3, 3, 3,
2, 2, 1, 1, 1, 2)), .Names = c("mpg", "cyl", "disp", "hp",
"drat", "wt", "qsec", "vs", "am", "gear", "carb"), row.names = c("AMC Javelin:2.1.2.2.2",
"Cadillac Fleetwood:1.2.1.2.1", "Camaro Z28:1.2.2.1.2.2", "Chrysler Imperial:1.2.1.1",
"Datsun 710:2.2.2.2.1.2.2.2.1", "Dodge Challenger:2.1.2.2.1",
"Duster 360:1.2.2.1.2.1", "Ferrari Dino:2.2.2.1", "Fiat 128:2.2.1.2.2.1",
"Fiat X1-9:2.2.1.2.2.2", "Ford Pantera L:1.2.2.1.1", "Honda Civic:2.2.1.1",
"Hornet 4 Drive:2.1.1.1", "Hornet Sportabout:1.2.2.2.1", "Lincoln Continental:1.2.1.2.2",
"Lotus Europa:2.2.2.2.1.1", "Maserati Bora:1.1", "Mazda RX4:2.2.2.2.2.2.1.1",
"Mazda RX4 Wag:2.2.2.2.2.2.1.2", "Merc 230:2.2.2.2.1.2.1", "Merc 240D:2.2.2.2.2.1",
"Merc 280:2.2.2.2.2.2.2.1", "Merc 280C:2.2.2.2.2.2.2.2", "Merc 450SE:2.1.2.1.2.1",
"Merc 450SL:2.1.2.1.2.2", "Merc 450SLC:2.1.2.1.1", "Pontiac Firebird:1.2.2.2.2",
"Porsche 914-2:2.2.2.2.1.2.2.2.2.2", "Toyota Corolla:2.2.1.2.1",
"Toyota Corona:2.2.2.2.1.2.2.2.2.1", "Valiant:2.1.1.2", "Volvo 142E:2.2.2.2.1.2.2.1"
), class = "data.frame")
produces this:
> head(df)
mpg cyl disp hp drat wt qsec vs am gear carb
AMC Javelin:2.1.2.2.2 15.2 8 304 150 3.15 3.435 17.30 0 0 3 2
Cadillac Fleetwood:1.2.1.2.1 10.4 8 472 205 2.93 5.250 17.98 0 0 3 4
Camaro Z28:1.2.2.1.2.2 13.3 8 350 245 3.73 3.840 15.41 0 0 3 4
Chrysler Imperial:1.2.1.1 14.7 8 440 230 3.23 5.345 17.42 0 0 3 4
Datsun 710:2.2.2.2.1.2.2.2.1 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
Dodge Challenger:2.1.2.2.1 15.5 8 318 150 2.76 3.520 16.87 0 0 3 2
Note that the row names are delimited with ":". What I want to do is split them
so that the 2nd part becomes a new column of the data frame:
ancest mpg cyl disp hp drat wt qsec vs am gear carb
AMC Javelin 2.1.2.2.2 15.2 8 304 150 3.15 3.435 17.30 0 0 3 2
Cadillac Fleetwood 1.2.1.2.1 10.4 8 472 205 2.93 5.250 17.98 0 0 3 4
What's the way to do it?
I'm stuck with this:
rn <- rownames(df)
unlist(lapply(rn,strsplit,":"))

We can use strsplit to get the output in a "list", rbind the output to get a matrix "m1". Change the rownames of "df" by the first column and create a new column "ancest" with the second column of "m1"
# Split every row name at ":" and stack the pieces row-wise into the
# character matrix "m1" (column 1: car name, column 2: ancestry code).
m1 <- do.call(rbind, strsplit(rn, ":"))
# The first piece becomes the new row names of "df" ...
rownames(df) <- m1[, 1]
# ... and the second piece is stored in the new column "ancest".
df$ancest <- m1[, 2]
Or if you need the first column of the dataset as one of the split columns,
# Put the second split piece in front of the existing columns as "ancest".
df1 <- cbind(ancest = m1[, 2], df)
# Use the first split piece (the car name) as the row names of the result.
rownames(df1) <- m1[, 1]
Or using splitstackshape and data.table
library(data.table)
library(splitstackshape)
# setDT() turns `df` into a data.table by reference; keep.rownames=TRUE moves
# the row names into a leading column "rn". cSplit() then splits "rn" at ":".
# Assumes (per the cSplit documentation) that the two pieces are appended after
# the 11 original columns as columns 12 (car name) and 13 (ancestry) - verify
# with names() if your splitstackshape version differs.
# c(13, 1:12) puts the ancestry first and leaves the car name in column 13;
# selecting only c(12) would keep a single column and break the lines below.
df1 <- setDF(cSplit(setDT(df, keep.rownames=TRUE)[],
'rn', sep=":")[, c(13, 1:12), with=FALSE])
# Promote the car name (column 13) to row names, then drop that column.
rownames(df1) <- df1[,13]
df1 <- df1[-13]
# The leading split column is now the ancestry; give it the requested name.
names(df1)[1] <- "ancest"

Try this:
# create a new variable with the row names
df$names <- rownames(df)
# split the new variable into two pieces, delete the pattern (the :), and keep both pieces
# (str_split_fixed returns a two-column character matrix: name, ancestry)
df$names <- stringr::str_split_fixed(df$names, ":", 2)

Using the sapply and [ functions:
# Row names of the form "<car name>:<ancestry>".
nm_plus_ancest <- row.names(df)
# One character vector per row: c("<car name>", "<ancestry>").
nm_plus_ancest_split <- strsplit(nm_plus_ancest, ":")
# Second element of every split goes into the new "ancest" column ...
df[["ancest"]] <- sapply(nm_plus_ancest_split, `[`, 2)
# ... and the first element replaces the row names.
row.names(df) <- sapply(nm_plus_ancest_split, `[`, 1)
And you can rearrange the columns with the nice dplyr::select function:
# library() stops immediately if dplyr is missing; require() would only warn
# and let the select() call below fail with a less helpful error.
library(dplyr)
# Move "ancest" to the front, followed by the original columns mpg ... carb.
df <- select(df, ancest, mpg:carb)

Related

Split and create a new dataframes for each variable in a specific column

I'm not exactly sure how to make progress on this question. I'm using here this mtcars dataset:
structure(list(index = 1:32, car = c("Mazda RX4", "Mazda RX4 Wag",
"Datsun 710", "Hornet 4 Drive", "Hornet Sportabout", "Valiant",
"Duster 360", "Merc 240D", "Merc 230", "Merc 280", "Merc 280C",
"Merc 450SE", "Merc 450SL", "Merc 450SLC", "Cadillac Fleetwood",
"Lincoln Continental", "Chrysler Imperial", "Fiat 128", "Honda Civic",
"Toyota Corolla", "Toyota Corona", "Dodge Challenger", "AMC Javelin",
"Camaro Z28", "Pontiac Firebird", "Fiat X1-9", "Porsche 914-2",
"Lotus Europa", "Ford Pantera L", "Ferrari Dino", "Maserati Bora",
"Volvo 142E"), mpg = c(21, 21, 22.8, 21.4, 18.7, 18.1, 14.3,
24.4, 22.8, 19.2, 17.8, 16.4, 17.3, 15.2, 10.4, 10.4, 14.7, 32.4,
30.4, 33.9, 21.5, 15.5, 15.2, 13.3, 19.2, 27.3, 26, 30.4, 15.8,
19.7, 15, 21.4), cyl = c(6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8,
8, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 8, 6, 8, 4),
disp = c(160, 160, 108, 258, 360, 225, 360, 146.7, 140.8,
167.6, 167.6, 275.8, 275.8, 275.8, 472, 460, 440, 78.7, 75.7,
71.1, 120.1, 318, 304, 350, 400, 79, 120.3, 95.1, 351, 145,
301, 121), hp = c(110, 110, 93, 110, 175, 105, 245, 62, 95,
123, 123, 180, 180, 180, 205, 215, 230, 66, 52, 65, 97, 150,
150, 245, 175, 66, 91, 113, 264, 175, 335, 109), drat = c(3.9,
3.9, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92,
3.07, 3.07, 3.07, 2.93, 3, 3.23, 4.08, 4.93, 4.22, 3.7, 2.76,
3.15, 3.73, 3.08, 4.08, 4.43, 3.77, 4.22, 3.62, 3.54, 4.11
), wt = c(2.62, 2.875, 2.32, 3.215, 3.44, 3.46, 3.57, 3.19,
3.15, 3.44, 3.44, 4.07, 3.73, 3.78, 5.25, 5.424, 5.345, 2.2,
1.615, 1.835, 2.465, 3.52, 3.435, 3.84, 3.845, 1.935, 2.14,
1.513, 3.17, 2.77, 3.57, 2.78), qsec = c(16.46, 17.02, 18.61,
19.44, 17.02, 20.22, 15.84, 20, 22.9, 18.3, 18.9, 17.4, 17.6,
18, 17.98, 17.82, 17.42, 19.47, 18.52, 19.9, 20.01, 16.87,
17.3, 15.41, 17.05, 18.9, 16.7, 16.9, 14.5, 15.5, 14.6, 18.6
), vs = c(0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1), am = c(1,
1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1), gear = c(4, 4, 4, 3,
3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3,
3, 3, 4, 5, 5, 5, 5, 5, 4), carb = c(4, 4, 1, 1, 2, 1, 4,
2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, 1, 2, 2, 4, 2, 1,
2, 2, 4, 6, 8, 2)), row.names = c(NA, -32L), class = c("tbl_df",
"tbl", "data.frame"))
Here is the pseudo-code that I have written.
# Pseudo-code from the question - not runnable as written.
# NOTE(review): `car` is used as a free variable in the loop bounds although it
# is a column of the data set shown above.
for (i in 1:length(car)) {
# Intended step: keep only the rows belonging to the i-th car.
mtcars %>%
filter(car == car[i]) %>%
# NOTE(review): the pipe ends in an assignment; the apparent intent is to store
# the filtered rows under a per-car name such as mtcars_mazda_rx4.
mtcars_i <- mtcars
}
The idea here is that I would like to create 32 different datasets, with the name of each car in the label of its dataset.
mtcars_mazda_rx4
mtcars_hornet_sportabout
etc.
Here mtcars_mazda_rx4 would be a dataframe with all the same variables but only one observation, where car == "Mazda RX4", i.e. mtcars[car == "Mazda RX4",]
Is there a way to create a for loop that filters the dataframe by a specific variable, and then outputs a new dataframe with that variable name identified in the new df?

We can use assign
# `car` is a column of mtcars, not a free variable, so take it from the data.
car_names <- mtcars$car
# seq_along() is safe even when there are no cars (1:length() would give 1, 0).
for (i in seq_along(car_names)) {
  # Rows belonging to the i-th car; inside filter(), `car` is the column.
  tmp <- mtcars %>%
    filter(car == car_names[i])
  # Build a syntactic object name as requested in the question,
  # e.g. "Mazda RX4" -> mtcars_mazda_rx4, "Porsche 914-2" -> mtcars_porsche_914_2.
  obj_name <- paste0("mtcars_", gsub("[^a-z0-9]+", "_", tolower(car_names[i])))
  assign(obj_name, tmp)
}

Just a different approach using split; I am using dplyr to make the solution more legible;
# Split the data into one data frame per car and publish each one as a
# separate object in the global environment.
library(tidyverse)
mtcars %>%
# rownames_to_column("car") %>% ## run this line if you are using original mtcars
# One list element per distinct value of the car column, named after that value.
split(., .$car) %>%
# Prefix every element name with "mtcars_".
set_names(., nm = paste0("mtcars_", names(.))) %>%
# Create one global variable per list element, using the element names.
list2env(., envir=.GlobalEnv)

How to extract values from a column corresponding to 100+ values stored in df [duplicate]

This question already has answers here:
subset a column in data frame based on another data frame/list
(2 answers)
Closed 2 years ago.
My data is big but I am taking example of mtcars database in R.
What I want is to extract the "cyl" column values that correspond to df (a data frame I have created which holds some values from the "mpg" column). In other words, I want the values of column cyl for the rows whose mpg value is stored in the data frame df.
> dput(mtcars)
structure(list(mpg = c(21, 21, 22.8, 21.4, 18.7, 18.1, 14.3,
24.4, 22.8, 19.2, 17.8, 16.4, 17.3, 15.2, 10.4, 10.4, 14.7, 32.4,
30.4, 33.9, 21.5, 15.5, 15.2, 13.3, 19.2, 27.3, 26, 30.4, 15.8,
19.7, 15, 21.4), cyl = c(6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8,
8, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 8, 6, 8, 4),
disp = c(160, 160, 108, 258, 360, 225, 360, 146.7, 140.8,
167.6, 167.6, 275.8, 275.8, 275.8, 472, 460, 440, 78.7, 75.7,
71.1, 120.1, 318, 304, 350, 400, 79, 120.3, 95.1, 351, 145,
301, 121), hp = c(110, 110, 93, 110, 175, 105, 245, 62, 95,
123, 123, 180, 180, 180, 205, 215, 230, 66, 52, 65, 97, 150,
150, 245, 175, 66, 91, 113, 264, 175, 335, 109), drat = c(3.9,
3.9, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92,
3.07, 3.07, 3.07, 2.93, 3, 3.23, 4.08, 4.93, 4.22, 3.7, 2.76,
3.15, 3.73, 3.08, 4.08, 4.43, 3.77, 4.22, 3.62, 3.54, 4.11
), wt = c(2.62, 2.875, 2.32, 3.215, 3.44, 3.46, 3.57, 3.19,
3.15, 3.44, 3.44, 4.07, 3.73, 3.78, 5.25, 5.424, 5.345, 2.2,
1.615, 1.835, 2.465, 3.52, 3.435, 3.84, 3.845, 1.935, 2.14,
1.513, 3.17, 2.77, 3.57, 2.78), qsec = c(16.46, 17.02, 18.61,
19.44, 17.02, 20.22, 15.84, 20, 22.9, 18.3, 18.9, 17.4, 17.6,
18, 17.98, 17.82, 17.42, 19.47, 18.52, 19.9, 20.01, 16.87,
17.3, 15.41, 17.05, 18.9, 16.7, 16.9, 14.5, 15.5, 14.6, 18.6
), vs = c(0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1), am = c(1,
1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1), gear = c(4, 4, 4, 3,
3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3,
3, 3, 4, 5, 5, 5, 5, 5, 4), carb = c(4, 4, 1, 1, 2, 1, 4,
2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, 1, 2, 2, 4, 2, 1,
2, 2, 4, 6, 8, 2)), row.names = c("Mazda RX4", "Mazda RX4 Wag",
"Datsun 710", "Hornet 4 Drive", "Hornet Sportabout", "Valiant",
"Duster 360", "Merc 240D", "Merc 230", "Merc 280", "Merc 280C",
"Merc 450SE", "Merc 450SL", "Merc 450SLC", "Cadillac Fleetwood",
"Lincoln Continental", "Chrysler Imperial", "Fiat 128", "Honda Civic",
"Toyota Corolla", "Toyota Corona", "Dodge Challenger", "AMC Javelin",
"Camaro Z28", "Pontiac Firebird", "Fiat X1-9", "Porsche 914-2",
"Lotus Europa", "Ford Pantera L", "Ferrari Dino", "Maserati Bora",
"Volvo 142E"), class = "data.frame")
dput(df)
structure(list(vals = c(21, 22.8, 15.2, 19.2, 17.8, 13.3, 15.5,
30.4, 10.4)), class = "data.frame", row.names = c(NA, -9L))
#I tried this
mtcars22 %>% filter(cyl,mpg==df)

You can use :
mtcars22$cyl[mtcars22$mpg %in% df$vals]
#[1] 6 6 4 4 6 6 8 8 8 4 8 8 8 8 4
Or
subset(mtcars22, mpg %in% df$vals, select = cyl)

Deleting all rows until certain value - than do the same for the next group

I have a dataset about the returns of stocks in the last 30 years. Now I need to delete all rows (years) for a company up to the first row which isn't NA. But I need to keep all other rows with NA for that company that may occur later. Then the code should jump to the next company (Id) and restart the process.
I already tried the following code, but to be honest I'm kind of lost.
# The asker's attempt, kept verbatim.
# NOTE(review): the stated goal is to drop each company's leading NA rows, but
# nothing in this function removes rows.
cleaning <- function (DT, colnames){
# Iterates over the elements of `colnames`; the loop body is the single
# if/else statement that follows.
for(cols in colnames)
# Tests the current element of `colnames` itself, not a column looked up in DT.
if(is.na(cols)){
# Per the warning quoted with the question, data.table reads `cols` here as a
# literal column name: it adds a column called "cols" and then deletes it.
DT[, cols := NULL]
} else {
# Leave the loop at the first element that is not NA.
break
}
}
# NOTE(review): RET is passed unquoted, so inside `[.data.table` it presumably
# evaluates to each group's RET values rather than to a column name - confirm.
MergedDT[, cleaning(MergedDT, RET), by = "Id"]
I received the following warning for that code:
> 1: In `[.data.table`(DT, , `:=`(cols, NULL)) : Adding new column
> 'cols' then assigning NULL (deleting it).
Furthermore, I think that there is a much more efficient way to solve this problem.

A combination of group_by, to do the analysis per company (or per cyl in this example) and do to find the first instance in which years (or mpg) is not NA should work:
df <- structure(list(model = c("Datsun 710", "Merc 240D", "Merc 230",
"Fiat 128", "Honda Civic", "Toyota Corolla", "Toyota Corona",
"Fiat X1-9", "Porsche 914-2", "Lotus Europa", "Volvo 142E", "Mazda RX4",
"Mazda RX4 Wag", "Hornet 4 Drive", "Valiant", "Merc 280", "Merc 280C",
"Ferrari Dino", "Hornet Sportabout", "Duster 360", "Merc 450SE",
"Merc 450SL", "Merc 450SLC", "Cadillac Fleetwood", "Lincoln Continental",
"Chrysler Imperial", "Dodge Challenger", "AMC Javelin", "Camaro Z28",
"Pontiac Firebird", "Ford Pantera L", "Maserati Bora"), mpg = c(NA,
NA, NA, NA, NA, 33.9, 21.5, NA, 26, 30.4, 21.4, NA, NA, NA, 18.1,
19.2, 17.8, 19.7, NA, NA, NA, NA, 15.2, 10.4, 10.4, 14.7, 15.5,
15.2, 13.3, 19.2, 15.8, 15), cyl = c(4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8), disp = c(108, 146.7, 140.8, 78.7, 75.7, 71.1, 120.1,
79, 120.3, 95.1, 121, 160, 160, 258, 225, 167.6, 167.6, 145,
360, 360, 275.8, 275.8, 275.8, 472, 460, 440, 318, 304, 350,
400, 351, 301), hp = c(93, 62, 95, 66, 52, 65, 97, 66, 91, 113,
109, 110, 110, 110, 105, 123, 123, 175, 175, 245, 180, 180, 180,
205, 215, 230, 150, 150, 245, 175, 264, 335), drat = c(3.85,
3.69, 3.92, 4.08, 4.93, 4.22, 3.7, 4.08, 4.43, 3.77, 4.11, 3.9,
3.9, 3.08, 2.76, 3.92, 3.92, 3.62, 3.15, 3.21, 3.07, 3.07, 3.07,
2.93, 3, 3.23, 2.76, 3.15, 3.73, 3.08, 4.22, 3.54), wt = c(2.32,
3.19, 3.15, 2.2, 1.615, 1.835, 2.465, 1.935, 2.14, 1.513, 2.78,
2.62, 2.875, 3.215, 3.46, 3.44, 3.44, 2.77, 3.44, 3.57, 4.07,
3.73, 3.78, 5.25, 5.424, 5.345, 3.52, 3.435, 3.84, 3.845, 3.17,
3.57), qsec = c(18.61, 20, 22.9, 19.47, 18.52, 19.9, 20.01, 18.9,
16.7, 16.9, 18.6, 16.46, 17.02, 19.44, 20.22, 18.3, 18.9, 15.5,
17.02, 15.84, 17.4, 17.6, 18, 17.98, 17.82, 17.42, 16.87, 17.3,
15.41, 17.05, 14.5, 14.6), vs = c(1, 1, 1, 1, 1, 1, 1, 1, 0,
1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0), am = c(1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1), gear = c(4,
4, 4, 4, 4, 4, 3, 4, 5, 5, 4, 4, 4, 3, 3, 4, 4, 5, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 5, 5), carb = c(1, 2, 2, 1, 2, 1, 1,
1, 2, 2, 2, 4, 4, 1, 1, 4, 4, 6, 2, 4, 3, 3, 3, 4, 4, 4, 2, 2,
4, 2, 4, 8)), row.names = c(NA, -32L), class = c("tbl_df", "tbl",
"data.frame"))
# Per cyl group, drop the leading rows whose mpg is NA and keep everything from
# the first non-NA mpg onward (later NAs included).
df %>%
  group_by(cyl) %>%
  do({
    # TRUE from the first non-NA mpg onward. For a group with no non-NA mpg the
    # mask is all FALSE, so the group yields zero rows instead of the
    # "NA/NaN argument" error raised by first(which(...)):nrow(.).
    keep <- cumsum(!is.na(.$mpg)) > 0
    .[keep, ]
  })

IIUC (if I understand correctly), you are looking to trim the leading NA returns for each ID. Here is an option:
# For each ID keep the rows from the first non-NA RET onward.
# - nomatch = .N + 1L: a group without any non-NA RET keeps no rows instead of
#   failing in seq_len(NA).
# - Selecting the rows to keep (rather than negating the rows to drop) avoids
#   DT[-integer(0)], which returns zero rows when no group has leading NAs.
DT[DT[, .I[seq_len(.N) >= match(TRUE, !is.na(RET), nomatch = .N + 1L)], by = ID]$V1]
output:
ID RET
1: 1 0.02
2: 1 NA
3: 2 0.01
4: 2 NA
5: 3 0.01
6: 3 0.05
7: 3 0.02
data:
DT <- data.table(ID=c(1,1,1,2,2,2,2,3,3,3), RET=c(NA,0.02,NA, NA,NA,0.01,NA, 0.01,0.05,0.02))
DT:
ID RET
1: 1 NA
2: 1 0.02
3: 1 NA
4: 2 NA
5: 2 NA
6: 2 0.01
7: 2 NA
8: 3 0.01
9: 3 0.05
10: 3 0.02

DT[DT[, .I[cumsum(!is.na(RET)) > 0], ID]$V1]
ID RET
1: 1 0.02
2: 1 NA
3: 2 0.01
4: 2 NA
5: 3 0.01
6: 3 0.05
7: 3 0.02
Data (stolen from chinsoon12 (Original question poster failed to provide reproducible data)):
DT <- data.table(ID=c(1,1,1,2,2,2,2,3,3,3), RET=c(NA,0.02,NA, NA,NA,0.01,NA, 0.01,0.05,0.02))

Matrix Transformation in R - from aggregate output to outer-like matrix

I need to transform the output of an aggregate (mean) into a matrix outer-like style.
data(mtcars)
aggregate(disp ~ cyl + gear, data = mtcars, FUN = mean )
cyl gear disp
4 3 120.1000
6 3 241.5000
8 3 357.6167
4 4 102.6250
6 4 163.8000
4 5 107.7000
6 5 145.0000
8 5 326.0000
What I need is to put the means of disp into a matrix with gear in columns and cyl in rows
Like this
3 4 5
4 120 102 107
6 241 163 145
8 357 NA 326
Do you have any suggestion how I could do this transformation ?
Is there a way to use the function
outer
?
structure(list(mpg = c(21, 21, 22.8, 21.4, 18.7, 18.1, 14.3,
24.4, 22.8, 19.2, 17.8, 16.4, 17.3, 15.2, 10.4, 10.4, 14.7, 32.4,
30.4, 33.9, 21.5, 15.5, 15.2, 13.3, 19.2, 27.3, 26, 30.4, 15.8,
19.7, 15, 21.4), cyl = c(6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8,
8, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 8, 6, 8, 4),
disp = c(160, 160, 108, 258, 360, 225, 360, 146.7, 140.8,
167.6, 167.6, 275.8, 275.8, 275.8, 472, 460, 440, 78.7, 75.7,
71.1, 120.1, 318, 304, 350, 400, 79, 120.3, 95.1, 351, 145,
301, 121), hp = c(110, 110, 93, 110, 175, 105, 245, 62, 95,
123, 123, 180, 180, 180, 205, 215, 230, 66, 52, 65, 97, 150,
150, 245, 175, 66, 91, 113, 264, 175, 335, 109), drat = c(3.9,
3.9, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92,
3.07, 3.07, 3.07, 2.93, 3, 3.23, 4.08, 4.93, 4.22, 3.7, 2.76,
3.15, 3.73, 3.08, 4.08, 4.43, 3.77, 4.22, 3.62, 3.54, 4.11
), wt = c(2.62, 2.875, 2.32, 3.215, 3.44, 3.46, 3.57, 3.19,
3.15, 3.44, 3.44, 4.07, 3.73, 3.78, 5.25, 5.424, 5.345, 2.2,
1.615, 1.835, 2.465, 3.52, 3.435, 3.84, 3.845, 1.935, 2.14,
1.513, 3.17, 2.77, 3.57, 2.78), qsec = c(16.46, 17.02, 18.61,
19.44, 17.02, 20.22, 15.84, 20, 22.9, 18.3, 18.9, 17.4, 17.6,
18, 17.98, 17.82, 17.42, 19.47, 18.52, 19.9, 20.01, 16.87,
17.3, 15.41, 17.05, 18.9, 16.7, 16.9, 14.5, 15.5, 14.6, 18.6
), vs = c(0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1), am = c(1,
1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1), gear = c(4, 4, 4, 3,
3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3,
3, 3, 4, 5, 5, 5, 5, 5, 4), carb = c(4, 4, 1, 1, 2, 1, 4,
2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, 1, 2, 2, 4, 2, 1,
2, 2, 4, 6, 8, 2)), .Names = c("mpg", "cyl", "disp", "hp",
"drat", "wt", "qsec", "vs", "am", "gear", "carb"), row.names = c("Mazda RX4",
"Mazda RX4 Wag", "Datsun 710", "Hornet 4 Drive", "Hornet Sportabout",
"Valiant", "Duster 360", "Merc 240D", "Merc 230", "Merc 280",
"Merc 280C", "Merc 450SE", "Merc 450SL", "Merc 450SLC", "Cadillac Fleetwood",
"Lincoln Continental", "Chrysler Imperial", "Fiat 128", "Honda Civic",
"Toyota Corolla", "Toyota Corona", "Dodge Challenger", "AMC Javelin",
"Camaro Z28", "Pontiac Firebird", "Fiat X1-9", "Porsche 914-2",
"Lotus Europa", "Ford Pantera L", "Ferrari Dino", "Maserati Bora",
"Volvo 142E"), class = "data.frame")

You can try tapply
# Mean displacement for every cyl (rows) x gear (columns) combination;
# combinations that do not occur in the data come out as NA.
tapply(mtcars$disp, list(mtcars$cyl, mtcars$gear), FUN = mean)
# 3 4 5
#4 120.1000 102.625 107.7
#6 241.5000 163.800 145.0
#8 357.6167 NA 326.0
If you are looking to reshape the output of aggregate, we can use acast from reshape2
# acast() lives in reshape2; without loading it the call below fails with
# "could not find function".
library(reshape2)
# Long format: one row per observed (cyl, gear) pair with its mean disp.
d1 <- aggregate(disp ~ cyl + gear, data = mtcars, FUN = mean)
# Wide matrix: cyl in rows, gear in columns, NA for unobserved pairs.
acast(d1, cyl ~ gear, value.var = "disp")

The most effective way to merge/combine two data sets by overlapping row.names and mean values

I would like to find the most effective way of combining two data frames and averaging the values in their columns when the row.names differ. That is, I would like to take just the row.names that overlap in both data sets and combine those rows into one data set. The values in the columns should be averaged (mean). The example data:
mtcars <-
structure(list(mpg = c(21, 21, 22.8, 21.4, 18.7, 18.1, 14.3,
24.4, 22.8, 19.2, 17.8, 16.4, 17.3, 15.2, 10.4, 10.4, 14.7, 32.4,
30.4, 33.9, 21.5, 15.5, 15.2, 13.3, 19.2, 27.3, 26, 30.4, 15.8,
19.7, 15, 21.4), cyl = c(6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8,
8, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 8, 6, 8, 4),
disp = c(160, 160, 108, 258, 360, 225, 360, 146.7, 140.8,
167.6, 167.6, 275.8, 275.8, 275.8, 472, 460, 440, 78.7, 75.7,
71.1, 120.1, 318, 304, 350, 400, 79, 120.3, 95.1, 351, 145,
301, 121), hp = c(110, 110, 93, 110, 175, 105, 245, 62, 95,
123, 123, 180, 180, 180, 205, 215, 230, 66, 52, 65, 97, 150,
150, 245, 175, 66, 91, 113, 264, 175, 335, 109), drat = c(3.9,
3.9, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92,
3.07, 3.07, 3.07, 2.93, 3, 3.23, 4.08, 4.93, 4.22, 3.7, 2.76,
3.15, 3.73, 3.08, 4.08, 4.43, 3.77, 4.22, 3.62, 3.54, 4.11
), wt = c(2.62, 2.875, 2.32, 3.215, 3.44, 3.46, 3.57, 3.19,
3.15, 3.44, 3.44, 4.07, 3.73, 3.78, 5.25, 5.424, 5.345, 2.2,
1.615, 1.835, 2.465, 3.52, 3.435, 3.84, 3.845, 1.935, 2.14,
1.513, 3.17, 2.77, 3.57, 2.78), qsec = c(16.46, 17.02, 18.61,
19.44, 17.02, 20.22, 15.84, 20, 22.9, 18.3, 18.9, 17.4, 17.6,
18, 17.98, 17.82, 17.42, 19.47, 18.52, 19.9, 20.01, 16.87,
17.3, 15.41, 17.05, 18.9, 16.7, 16.9, 14.5, 15.5, 14.6, 18.6
), vs = c(0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1), am = c(1,
1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1), gear = c(4, 4, 4, 3,
3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3,
3, 3, 4, 5, 5, 5, 5, 5, 4), carb = c(4, 4, 1, 1, 2, 1, 4,
2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, 1, 2, 2, 4, 2, 1,
2, 2, 4, 6, 8, 2)), .Names = c("mpg", "cyl", "disp", "hp",
"drat", "wt", "qsec", "vs", "am", "gear", "carb"), row.names = c("Mazda RX4",
"Mazda RX4 Wag", "Datsun 710", "Hornet 4 Drive", "Hornet Sportabout",
"Valiant", "Duster 360", "Merc 240D", "Merc 230", "Merc 280",
"Merc 280C", "Merc 450SE", "Merc 450SL", "Merc 450SLC", "Cadillac Fleetwood",
"Lincoln Continental", "Chrysler Imperial", "Fiat 128", "Honda Civic",
"Toyota Corolla", "Toyota Corona", "Dodge Challenger", "AMC Javelin",
"Camaro Z28", "Pontiac Firebird", "Fiat X1-9", "Porsche 914-2",
"Lotus Europa", "Ford Pantera L", "Ferrari Dino", "Maserati Bora",
"Volvo 142E"), class = "data.frame")
Second data:
mtcars11 <-
structure(list(mpg = c(21, 21, 22.8, 21.4, 18.7, 18.1, 14.3,
24.4, 22.8, 19.2, 17.8, 16.4, 17.3, 15.2, 10.4, 10.4, 14.7, 32.4,
30.4, 33.9, 21.5, 15.5, 15.2, 13.3, 19.2, 27.3, 26, 30.4, 15.8,
19.7), cyl = c(6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 8, 8, 8, 8,
8, 4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 8, 6), disp = c(160, 160,
108, 258, 360, 225, 360, 146.7, 140.8, 167.6, 167.6, 275.8, 275.8,
275.8, 472, 460, 440, 78.7, 75.7, 71.1, 120.1, 318, 304, 350,
400, 79, 120.3, 95.1, 351, 145), hp = c(110, 110, 93, 110, 175,
105, 245, 62, 95, 123, 123, 180, 180, 180, 205, 215, 230, 66,
52, 65, 97, 150, 150, 245, 175, 66, 91, 113, 264, 175), drat = c(3.9,
3.9, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92, 3.07,
3.07, 3.07, 2.93, 3, 3.23, 4.08, 4.93, 4.22, 3.7, 2.76, 3.15,
3.73, 3.08, 4.08, 4.43, 3.77, 4.22, 3.62), wt = c(2.62, 2.875,
2.32, 3.215, 3.44, 3.46, 3.57, 3.19, 3.15, 3.44, 3.44, 4.07,
3.73, 3.78, 5.25, 5.424, 5.345, 2.2, 1.615, 1.835, 2.465, 3.52,
3.435, 3.84, 3.845, 1.935, 2.14, 1.513, 3.17, 2.77), qsec = c(16.46,
17.02, 18.61, 19.44, 17.02, 20.22, 15.84, 20, 22.9, 18.3, 18.9,
17.4, 17.6, 18, 17.98, 17.82, 17.42, 19.47, 18.52, 19.9, 20.01,
16.87, 17.3, 15.41, 17.05, 18.9, 16.7, 16.9, 14.5, 15.5), vs = c(0,
0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
0, 0, 0, 1, 0, 1, 0, 0), am = c(1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1),
gear = c(4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3,
3, 4, 4, 4, 3, 3, 3, 3, 3, 4, 5, 5, 5, 5), carb = c(4, 4,
1, 1, 2, 1, 4, 2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, 1,
2, 2, 4, 2, 1, 2, 2, 4, 6)), .Names = c("mpg", "cyl", "disp",
"hp", "drat", "wt", "qsec", "vs", "am", "gear", "carb"), row.names = c("Mazda RX4",
"Chrysler", "Datsun 710", "Hornet 4 Drive", "Hornet Sportabout",
"Valiant", "Duster 360", "Merc 240D", "Merc 230", "Merc 280",
"Merc 280C", "Merc 450SE", "Nexia", "Merc 450SLC", "Cadillac Fleetwood",
"Lincoln Continental", "Chrysler Imperial", "Fiat 128", "Honda Civic",
"Toyota Corolla", "Toyota Corona", "Dodge Challenger", "AMC Javelin",
"Camaro Z28", "Pontiac Firebirda", "Punto", "Porsche 914-2",
"Lotus Europa", "Ford Pantera T", "Ferrari Dino"), class = "data.frame")
So the solution which came to my mind is that (the long one):
## row names of the first and of the second data set
vec_names_mt <- rownames(mtcars)
vec_names_mt11 <- rownames(mtcars11)
## names that occur in both data sets
vec_inter <- intersect(vec_names_mt, vec_names_mt11)
## rows of each data set whose name is shared, in that data set's own order
data_mt <- mtcars[vec_names_mt %in% vec_inter, ]
data_mt11 <- mtcars11[vec_names_mt11 %in% vec_inter, ]
How can we combine them and average the values ? Any idea how to do that in the simplest way ?

Assuming d1 and d2 are your data.frames, here's how I'd approach it. You'll have to use development version of data.table (v1.9.5) though, for mget to work.
# library() fails fast if data.table is missing; require() would only warn.
library(data.table) # v1.9.5
# Convert both data.frames to data.tables by reference, move the row names
# into a column "rn", and key on it so that d1[d2] joins on "rn".
setkey(setDT(d1, keep.rownames=TRUE), rn)
setkey(setDT(d2, keep.rownames=TRUE), rn)
# Value columns of d1 (everything except "rn") ...
xcols <- names(d1)[-1L]
# ... and the matching columns of d2, which carry an "i." prefix inside the join.
icols <- paste0("i.", xcols)
# Mean of the values from both tables, ignoring NAs.
foo <- function(a, b) mean(c(a, b), na.rm=TRUE)
# Inner join on "rn"; for every matching row, average each pair of columns.
d1[d2, Map(foo, mget(xcols), mget(icols)), by=.EACHI, nomatch=0L]
We first convert the data.frames to data.tables by reference using setDT, and convert row names to a new column (which will automatically be named rn), and set key on that column.
setkey() reorders a data.table by the columns specified, and marks those columns as key columns, which will help us perform a join (on those key columns).
In data.tables, joins can be accomplished by using the x[i] notation as well as merge() function (there's a data.table method implemented), but x[i] is much more powerful and flexible. The syntax x[i] joins each row of i to matching rows in x (on the key columns).
So, d1[d2] would return for each row in d2 the matching rows in d1, along with all the other columns in d2.
d1[d2, nomatch=0L] is the equivalent of an inner join, where only rows that match are returned.
d1[d2, Map(foo, mget(xcols), mget(icols)), by=.EACHI, nomatch=0L] evaluates the expression in j = Map(...), for each row in d2 - hence by = .EACHI.
To sum things up, for each row in d2, find the matching rows in d1. Extract the columns specified in xcols and icols just for those matching rows, and apply the function foo(), which will concatenate the vectors and take their mean(). And do this for each row of d2 (by = .EACHI). Ignore rows in d2 that don't have any matches in d1 on the key column (nomatch=0L).
Hope this helps.

It seems like you are looking for an "inner join" between two data sets by the row names. I would suggest to try data.table package for both merging and the later melting and dcasting operation.
First, I will copy mtcars to mtcars2: mtcars is a built-in data set that I don't want to overwrite, and setDT can't modify built-in data sets in place anyway. So let's say that in real life your data is called mtcars2.
library(data.table)
mtcars2 <- copy(mtcars)
Next, we will convert to data.table objects, while keeping row names, and setting a key for a faster join
setkey(setDT(mtcars2, keep.rownames = TRUE), rn)
setkey(setDT(mtcars11, keep.rownames = TRUE), rn)
Now we will perform an inner join over rn (the key) while keeping original column names using suffixes = NULL
Res <- merge(mtcars2, mtcars11, suffixes = NULL)
Now we can melt by rn and then dcast by unique columns while computing the mean
dcast(melt(Res, "rn"), rn ~ variable, mean.default)
# rn mpg cyl disp hp drat wt qsec vs am gear carb
# 1: AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
# 2: Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
# 3: Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
# 4: Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
# 5: Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
# 6: Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
# 7: Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
# 8: Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
# 9: Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
# 10: Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
# 11: Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
# 12: Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
# 13: Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
# 14: Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
# 15: Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
# 16: Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
# 17: Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
# 18: Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
# 19: Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
# 20: Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
# 21: Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
# 22: Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
# 23: Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
# 24: Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
# 25: Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Splitting row names by delimiter into another column in an data frame - r

Try this: # create a new variable with the row names df$names <- rownames(df) # split the new variable into two pieces, delete the pattern (the :), and keep both pieces df$names <- stringr::str_split_fixed(df$names, ":", 2)

Related

Split and create a new dataframes for each variable in a specific column

How to extract values from a column corresponding to 100+ values stored in df [duplicate]

Deleting all rows until certain value - than do the same for the next group

Matrix Transformation in R - from aggregate output to outer-like matrix

The most effective way to merge/combine two data sets by overlapping row.names and mean values

Categories

Resources