Related
I'm not exactly sure how to make progress on this question. I'm using this version of the mtcars dataset:
structure(list(index = 1:32, car = c("Mazda RX4", "Mazda RX4 Wag",
"Datsun 710", "Hornet 4 Drive", "Hornet Sportabout", "Valiant",
"Duster 360", "Merc 240D", "Merc 230", "Merc 280", "Merc 280C",
"Merc 450SE", "Merc 450SL", "Merc 450SLC", "Cadillac Fleetwood",
"Lincoln Continental", "Chrysler Imperial", "Fiat 128", "Honda Civic",
"Toyota Corolla", "Toyota Corona", "Dodge Challenger", "AMC Javelin",
"Camaro Z28", "Pontiac Firebird", "Fiat X1-9", "Porsche 914-2",
"Lotus Europa", "Ford Pantera L", "Ferrari Dino", "Maserati Bora",
"Volvo 142E"), mpg = c(21, 21, 22.8, 21.4, 18.7, 18.1, 14.3,
24.4, 22.8, 19.2, 17.8, 16.4, 17.3, 15.2, 10.4, 10.4, 14.7, 32.4,
30.4, 33.9, 21.5, 15.5, 15.2, 13.3, 19.2, 27.3, 26, 30.4, 15.8,
19.7, 15, 21.4), cyl = c(6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8,
8, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 8, 6, 8, 4),
disp = c(160, 160, 108, 258, 360, 225, 360, 146.7, 140.8,
167.6, 167.6, 275.8, 275.8, 275.8, 472, 460, 440, 78.7, 75.7,
71.1, 120.1, 318, 304, 350, 400, 79, 120.3, 95.1, 351, 145,
301, 121), hp = c(110, 110, 93, 110, 175, 105, 245, 62, 95,
123, 123, 180, 180, 180, 205, 215, 230, 66, 52, 65, 97, 150,
150, 245, 175, 66, 91, 113, 264, 175, 335, 109), drat = c(3.9,
3.9, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92,
3.07, 3.07, 3.07, 2.93, 3, 3.23, 4.08, 4.93, 4.22, 3.7, 2.76,
3.15, 3.73, 3.08, 4.08, 4.43, 3.77, 4.22, 3.62, 3.54, 4.11
), wt = c(2.62, 2.875, 2.32, 3.215, 3.44, 3.46, 3.57, 3.19,
3.15, 3.44, 3.44, 4.07, 3.73, 3.78, 5.25, 5.424, 5.345, 2.2,
1.615, 1.835, 2.465, 3.52, 3.435, 3.84, 3.845, 1.935, 2.14,
1.513, 3.17, 2.77, 3.57, 2.78), qsec = c(16.46, 17.02, 18.61,
19.44, 17.02, 20.22, 15.84, 20, 22.9, 18.3, 18.9, 17.4, 17.6,
18, 17.98, 17.82, 17.42, 19.47, 18.52, 19.9, 20.01, 16.87,
17.3, 15.41, 17.05, 18.9, 16.7, 16.9, 14.5, 15.5, 14.6, 18.6
), vs = c(0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1), am = c(1,
1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1), gear = c(4, 4, 4, 3,
3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3,
3, 3, 4, 5, 5, 5, 5, 5, 4), carb = c(4, 4, 1, 1, 2, 1, 4,
2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, 1, 2, 2, 4, 2, 1,
2, 2, 4, 6, 8, 2)), row.names = c(NA, -32L), class = c("tbl_df",
"tbl", "data.frame"))
Here is the pseudo-code that I have written.
for (i in 1:length(car)) {
mtcars %>%
filter(car == car[i]) %>%
mtcars_i <- mtcars
}
The idea here is that I would like to create 32 different datasets, with the name of each car in the label of its dataset.
mtcars_mazda_rx4
mtcars_hornet_sportabout
etc.
Here mtcars_mazda_rx4 would be a dataframe with all the same variables but only one observation, where car == "Mazda RX4", i.e. mtcars[car == "Mazda RX4",]
Is there a way to create a for loop that filters the dataframe by a specific variable, and then outputs a new dataframe with that variable name identified in the new df?
We can use assign
# Create one data frame per car, named "mtcars_<car name>", in the current
# environment. `car` is a column of `mtcars`, not a free-standing object, so
# outside of dplyr verbs it has to be accessed as `mtcars$car`.
for (i in seq_along(mtcars$car)) {
  car_i <- mtcars$car[i]
  tmp <- mtcars %>%
    filter(car == car_i)
  # Names containing spaces (e.g. `mtcars_Mazda RX4`) must be quoted with
  # backticks when they are used later.
  assign(paste0("mtcars_", car_i), tmp)
}
Just a different approach using split; I am using dplyr to make the solution more legible;
library(tidyverse)
# Pipeline: split the data into a list with one data frame per value of `car`,
# prefix each list name with "mtcars_", then copy the list elements into the
# global environment as separate objects.
mtcars %>%
# rownames_to_column("car") %>% ## run this line if you are using original mtcars
split(., .$car) %>%
set_names(., nm = paste0("mtcars_", names(.))) %>%
list2env(., envir=.GlobalEnv)
This question already has answers here:
subset a column in data frame based on another data frame/list
(2 answers)
Closed 2 years ago.
My data is big but I am taking example of mtcars database in R.
What I want is to extract the values of the "cyl" column that correspond to df (a data frame I have created, which holds some values from the "mpg" column). In other words, I want the cyl values of the rows whose mpg value is stored in df.
> dput(mtcars)
structure(list(mpg = c(21, 21, 22.8, 21.4, 18.7, 18.1, 14.3,
24.4, 22.8, 19.2, 17.8, 16.4, 17.3, 15.2, 10.4, 10.4, 14.7, 32.4,
30.4, 33.9, 21.5, 15.5, 15.2, 13.3, 19.2, 27.3, 26, 30.4, 15.8,
19.7, 15, 21.4), cyl = c(6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8,
8, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 8, 6, 8, 4),
disp = c(160, 160, 108, 258, 360, 225, 360, 146.7, 140.8,
167.6, 167.6, 275.8, 275.8, 275.8, 472, 460, 440, 78.7, 75.7,
71.1, 120.1, 318, 304, 350, 400, 79, 120.3, 95.1, 351, 145,
301, 121), hp = c(110, 110, 93, 110, 175, 105, 245, 62, 95,
123, 123, 180, 180, 180, 205, 215, 230, 66, 52, 65, 97, 150,
150, 245, 175, 66, 91, 113, 264, 175, 335, 109), drat = c(3.9,
3.9, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92,
3.07, 3.07, 3.07, 2.93, 3, 3.23, 4.08, 4.93, 4.22, 3.7, 2.76,
3.15, 3.73, 3.08, 4.08, 4.43, 3.77, 4.22, 3.62, 3.54, 4.11
), wt = c(2.62, 2.875, 2.32, 3.215, 3.44, 3.46, 3.57, 3.19,
3.15, 3.44, 3.44, 4.07, 3.73, 3.78, 5.25, 5.424, 5.345, 2.2,
1.615, 1.835, 2.465, 3.52, 3.435, 3.84, 3.845, 1.935, 2.14,
1.513, 3.17, 2.77, 3.57, 2.78), qsec = c(16.46, 17.02, 18.61,
19.44, 17.02, 20.22, 15.84, 20, 22.9, 18.3, 18.9, 17.4, 17.6,
18, 17.98, 17.82, 17.42, 19.47, 18.52, 19.9, 20.01, 16.87,
17.3, 15.41, 17.05, 18.9, 16.7, 16.9, 14.5, 15.5, 14.6, 18.6
), vs = c(0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1), am = c(1,
1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1), gear = c(4, 4, 4, 3,
3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3,
3, 3, 4, 5, 5, 5, 5, 5, 4), carb = c(4, 4, 1, 1, 2, 1, 4,
2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, 1, 2, 2, 4, 2, 1,
2, 2, 4, 6, 8, 2)), row.names = c("Mazda RX4", "Mazda RX4 Wag",
"Datsun 710", "Hornet 4 Drive", "Hornet Sportabout", "Valiant",
"Duster 360", "Merc 240D", "Merc 230", "Merc 280", "Merc 280C",
"Merc 450SE", "Merc 450SL", "Merc 450SLC", "Cadillac Fleetwood",
"Lincoln Continental", "Chrysler Imperial", "Fiat 128", "Honda Civic",
"Toyota Corolla", "Toyota Corona", "Dodge Challenger", "AMC Javelin",
"Camaro Z28", "Pontiac Firebird", "Fiat X1-9", "Porsche 914-2",
"Lotus Europa", "Ford Pantera L", "Ferrari Dino", "Maserati Bora",
"Volvo 142E"), class = "data.frame")
dput(df)
structure(list(vals = c(21, 22.8, 15.2, 19.2, 17.8, 13.3, 15.5,
30.4, 10.4)), class = "data.frame", row.names = c(NA, -9L))
#I tried this
mtcars22 %>% filter(cyl,mpg==df)
You can use :
mtcars22$cyl[mtcars22$mpg %in% df$vals]
#[1] 6 6 4 4 6 6 8 8 8 4 8 8 8 8 4
Or
subset(mtcars22, mpg %in% df$vals, select = cyl)
I have a dataset about the returns of stocks over the last 30 years. Now I need to delete all rows (years) for a company up to the first row that isn't NA. But I need to keep all other rows with NA for that company that may occur later. Then the code should move on to the next company (Id) and restart the process.
I already tried the following code, but to be honest I'm kind of lost.
cleaning <- function (DT, colnames){
for(cols in colnames)
if(is.na(cols)){
DT[, cols := NULL]
} else {
break
}
}
MergedDT[, cleaning(MergedDT, RET), by = "Id"]
I received the following warning for that code:
> 1: In `[.data.table`(DT, , `:=`(cols, NULL)) : Adding new column
> 'cols' then assigning NULL (deleting it).
Furthermore, I think that there is a way more efficient way to solve that problem.
A combination of group_by, to do the analysis per company (or per cyl in this example) and do to find the first instance in which years (or mpg) is not NA should work:
df <- structure(list(model = c("Datsun 710", "Merc 240D", "Merc 230",
"Fiat 128", "Honda Civic", "Toyota Corolla", "Toyota Corona",
"Fiat X1-9", "Porsche 914-2", "Lotus Europa", "Volvo 142E", "Mazda RX4",
"Mazda RX4 Wag", "Hornet 4 Drive", "Valiant", "Merc 280", "Merc 280C",
"Ferrari Dino", "Hornet Sportabout", "Duster 360", "Merc 450SE",
"Merc 450SL", "Merc 450SLC", "Cadillac Fleetwood", "Lincoln Continental",
"Chrysler Imperial", "Dodge Challenger", "AMC Javelin", "Camaro Z28",
"Pontiac Firebird", "Ford Pantera L", "Maserati Bora"), mpg = c(NA,
NA, NA, NA, NA, 33.9, 21.5, NA, 26, 30.4, 21.4, NA, NA, NA, 18.1,
19.2, 17.8, 19.7, NA, NA, NA, NA, 15.2, 10.4, 10.4, 14.7, 15.5,
15.2, 13.3, 19.2, 15.8, 15), cyl = c(4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 6, 6, 6, 6, 6, 6, 6, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
8, 8, 8), disp = c(108, 146.7, 140.8, 78.7, 75.7, 71.1, 120.1,
79, 120.3, 95.1, 121, 160, 160, 258, 225, 167.6, 167.6, 145,
360, 360, 275.8, 275.8, 275.8, 472, 460, 440, 318, 304, 350,
400, 351, 301), hp = c(93, 62, 95, 66, 52, 65, 97, 66, 91, 113,
109, 110, 110, 110, 105, 123, 123, 175, 175, 245, 180, 180, 180,
205, 215, 230, 150, 150, 245, 175, 264, 335), drat = c(3.85,
3.69, 3.92, 4.08, 4.93, 4.22, 3.7, 4.08, 4.43, 3.77, 4.11, 3.9,
3.9, 3.08, 2.76, 3.92, 3.92, 3.62, 3.15, 3.21, 3.07, 3.07, 3.07,
2.93, 3, 3.23, 2.76, 3.15, 3.73, 3.08, 4.22, 3.54), wt = c(2.32,
3.19, 3.15, 2.2, 1.615, 1.835, 2.465, 1.935, 2.14, 1.513, 2.78,
2.62, 2.875, 3.215, 3.46, 3.44, 3.44, 2.77, 3.44, 3.57, 4.07,
3.73, 3.78, 5.25, 5.424, 5.345, 3.52, 3.435, 3.84, 3.845, 3.17,
3.57), qsec = c(18.61, 20, 22.9, 19.47, 18.52, 19.9, 20.01, 18.9,
16.7, 16.9, 18.6, 16.46, 17.02, 19.44, 20.22, 18.3, 18.9, 15.5,
17.02, 15.84, 17.4, 17.6, 18, 17.98, 17.82, 17.42, 16.87, 17.3,
15.41, 17.05, 14.5, 14.6), vs = c(1, 1, 1, 1, 1, 1, 1, 1, 0,
1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0), am = c(1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0,
0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1), gear = c(4,
4, 4, 4, 4, 4, 3, 4, 5, 5, 4, 4, 4, 3, 3, 4, 4, 5, 3, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3, 3, 5, 5), carb = c(1, 2, 2, 1, 2, 1, 1,
1, 2, 2, 2, 4, 4, 1, 1, 4, 4, 6, 2, 4, 3, 3, 3, 4, 4, 4, 2, 2,
4, 2, 4, 8)), row.names = c(NA, -32L), class = c("tbl_df", "tbl",
"data.frame"))
# For each `cyl` group, drop the leading rows whose `mpg` is NA and keep
# everything from the first non-NA `mpg` onwards (later NAs are kept).
df %>%
  group_by(cyl) %>%
  do({
    first_ok <- which(!is.na(.$mpg))[1]
    # A group with no non-NA `mpg` has nothing to keep; without this guard
    # `NA:nrow(.)` would raise an error.
    if (is.na(first_ok)) .[0, ] else .[first_ok:nrow(.), ]
  })
If I understand correctly, you are looking to trim the leading NA returns for each ID; here is an option:
# Row numbers of the leading NA returns within each ID. `nomatch = .N + 1L`
# makes an all-NA group drop all of its rows instead of erroring in seq_len().
drop_idx <- DT[
  ,
  .I[seq_len(match(TRUE, !is.na(RET), nomatch = .N + 1L) - 1L)],
  by = .(ID)
]$V1
# Select by complement rather than by negative index: `DT[-integer(0)]`
# returns zero rows when no group has any leading NA to drop.
DT[setdiff(seq_len(nrow(DT)), drop_idx)]
output:
ID RET
1: 1 0.02
2: 1 NA
3: 2 0.01
4: 2 NA
5: 3 0.01
6: 3 0.05
7: 3 0.02
data:
DT <- data.table(ID=c(1,1,1,2,2,2,2,3,3,3), RET=c(NA,0.02,NA, NA,NA,0.01,NA, 0.01,0.05,0.02))
DT:
ID RET
1: 1 NA
2: 1 0.02
3: 1 NA
4: 2 NA
5: 2 NA
6: 2 0.01
7: 2 NA
8: 3 0.01
9: 3 0.05
10: 3 0.02
# Within each ID, cumsum(!is.na(RET)) stays 0 until the first non-NA RET, so
# the condition keeps rows from that point on; .I collects their row numbers.
DT[DT[, .I[cumsum(!is.na(RET)) > 0], ID]$V1]
ID RET
1: 1 0.02
2: 1 NA
3: 2 0.01
4: 2 NA
5: 3 0.01
6: 3 0.05
7: 3 0.02
Data (stolen from chinsoon12 (Original question poster failed to provide reproducible data)):
DT <- data.table(ID=c(1,1,1,2,2,2,2,3,3,3), RET=c(NA,0.02,NA, NA,NA,0.01,NA, 0.01,0.05,0.02))
For example I have a code like that:
structure(list(mpg = c(21, 21, 22.8, 21.4, 18.7, 18.1, 14.3,
24.4, 22.8, 19.2, 17.8, 16.4, 17.3, 15.2, 10.4, 10.4, 14.7, 32.4,
30.4, 33.9, 21.5, 15.5, 15.2, 13.3, 19.2, 27.3, 26, 30.4, 15.8,
19.7, 15, 21.4), cyl = c(6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8,
8, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 8, 6, 8, 4),
disp = c(160, 160, 108, 258, 360, 225, 360, 146.7, 140.8,
167.6, 167.6, 275.8, 275.8, 275.8, 472, 460, 440, 78.7, 75.7,
71.1, 120.1, 318, 304, 350, 400, 79, 120.3, 95.1, 351, 145,
301, 121), hp = c(110, 110, 93, 110, 175, 105, 245, 62, 95,
123, 123, 180, 180, 180, 205, 215, 230, 66, 52, 65, 97, 150,
150, 245, 175, 66, 91, 113, 264, 175, 335, 109), drat = c(3.9,
3.9, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92,
3.07, 3.07, 3.07, 2.93, 3, 3.23, 4.08, 4.93, 4.22, 3.7, 2.76,
3.15, 3.73, 3.08, 4.08, 4.43, 3.77, 4.22, 3.62, 3.54, 4.11
), wt = c(2.62, 2.875, 2.32, 3.215, 3.44, 3.46, 3.57, 3.19,
3.15, 3.44, 3.44, 4.07, 3.73, 3.78, 5.25, 5.424, 5.345, 2.2,
1.615, 1.835, 2.465, 3.52, 3.435, 3.84, 3.845, 1.935, 2.14,
1.513, 3.17, 2.77, 3.57, 2.78), qsec = c(16.46, 17.02, 18.61,
19.44, 17.02, 20.22, 15.84, 20, 22.9, 18.3, 18.9, 17.4, 17.6,
18, 17.98, 17.82, 17.42, 19.47, 18.52, 19.9, 20.01, 16.87,
17.3, 15.41, 17.05, 18.9, 16.7, 16.9, 14.5, 15.5, 14.6, 18.6
), vs = c(0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1), am = c(1,
1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1), gear = c(4, 4, 4, 3,
3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3,
3, 3, 4, 5, 5, 5, 5, 5, 4), carb = c(4, 4, 1, 1, 2, 1, 4,
2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, 1, 2, 2, 4, 2, 1,
2, 2, 4, 6, 8, 2)), .Names = c("mpg", "cyl", "disp", "hp",
"drat", "wt", "qsec", "vs", "am", "gear", "carb"), row.names = c("Mazda RX4",
"Mazda RX4 Wag", "Datsun 710", "Hornet 4 Drive", "Hornet Sportabout",
"Valiant", "Duster 360", "Merc 240D", "Merc 230", "Merc 280",
"Merc 280C", "Merc 450SE", "Merc 450SL", "Merc 450SLC", "Cadillac Fleetwood",
"Lincoln Continental", "Chrysler Imperial", "Fiat 128", "Honda Civic",
"Toyota Corolla", "Toyota Corona", "Dodge Challenger", "AMC Javelin",
"Camaro Z28", "Pontiac Firebird", "Fiat X1-9", "Porsche 914-2",
"Lotus Europa", "Ford Pantera L", "Ferrari Dino", "Maserati Bora",
"Volvo 142E"), class = "data.frame")
I thought it would be easy and that only the function dget would be needed, but it looks like I can't load the data.frame like that. Can you tell me how to easily load the code above?
Use the assignment operator <-:
my.object.df <- structure(list(mpg = c(21, 21, 22.8, 21.4, 18.7, 18.1, 14.3,
24.4, 22.8, 19.2, 17.8, 16.4, 17.3, 15.2, 10.4, 10.4, 14.7, 32.4,
30.4, 33.9, 21.5, 15.5, 15.2, 13.3, 19.2, 27.3, 26, 30.4, 15.8,
19.7, 15, 21.4), cyl = c(6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8,
8, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 8, 6, 8, 4),
disp = c(160, 160, 108, 258, 360, 225, 360, 146.7, 140.8,
167.6, 167.6, 275.8, 275.8, 275.8, 472, 460, 440, 78.7, 75.7,
71.1, 120.1, 318, 304, 350, 400, 79, 120.3, 95.1, 351, 145,
301, 121), hp = c(110, 110, 93, 110, 175, 105, 245, 62, 95,
123, 123, 180, 180, 180, 205, 215, 230, 66, 52, 65, 97, 150,
150, 245, 175, 66, 91, 113, 264, 175, 335, 109), drat = c(3.9,
3.9, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.92, 3.92,
3.07, 3.07, 3.07, 2.93, 3, 3.23, 4.08, 4.93, 4.22, 3.7, 2.76,
3.15, 3.73, 3.08, 4.08, 4.43, 3.77, 4.22, 3.62, 3.54, 4.11
), wt = c(2.62, 2.875, 2.32, 3.215, 3.44, 3.46, 3.57, 3.19,
3.15, 3.44, 3.44, 4.07, 3.73, 3.78, 5.25, 5.424, 5.345, 2.2,
1.615, 1.835, 2.465, 3.52, 3.435, 3.84, 3.845, 1.935, 2.14,
1.513, 3.17, 2.77, 3.57, 2.78), qsec = c(16.46, 17.02, 18.61,
19.44, 17.02, 20.22, 15.84, 20, 22.9, 18.3, 18.9, 17.4, 17.6,
18, 17.98, 17.82, 17.42, 19.47, 18.52, 19.9, 20.01, 16.87,
17.3, 15.41, 17.05, 18.9, 16.7, 16.9, 14.5, 15.5, 14.6, 18.6
), vs = c(0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1), am = c(1,
1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1), gear = c(4, 4, 4, 3,
3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 3, 3,
3, 3, 4, 5, 5, 5, 5, 5, 4), carb = c(4, 4, 1, 1, 2, 1, 4,
2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, 1, 1, 2, 2, 4, 2, 1,
2, 2, 4, 6, 8, 2)), .Names = c("mpg", "cyl", "disp", "hp",
"drat", "wt", "qsec", "vs", "am", "gear", "carb"), row.names = c("Mazda RX4",
"Mazda RX4 Wag", "Datsun 710", "Hornet 4 Drive", "Hornet Sportabout",
"Valiant", "Duster 360", "Merc 240D", "Merc 230", "Merc 280",
"Merc 280C", "Merc 450SE", "Merc 450SL", "Merc 450SLC", "Cadillac Fleetwood",
"Lincoln Continental", "Chrysler Imperial", "Fiat 128", "Honda Civic",
"Toyota Corolla", "Toyota Corona", "Dodge Challenger", "AMC Javelin",
"Camaro Z28", "Pontiac Firebird", "Fiat X1-9", "Porsche 914-2",
"Lotus Europa", "Ford Pantera L", "Ferrari Dino", "Maserati Bora",
"Volvo 142E"), class = "data.frame")
You could also make a text file and then use source to read and execute it. (It would, of course, be faster to load it from an '.Rdata' file created with save.)
This is a small example.
> test <- c(1.1, 2,3, 4)
> dput(test)
c(1.1, 2, 3, 4)
> dput( test, "mytest.txt")
> dget("mytest.txt")
[1] 1.1 2.0 3.0 4.0
> newtest <- dget("mytest.txt")
> newtest
[1] 1.1 2.0 3.0 4.0
Notice that the object needs to be assigned at the time of input with dget. If, on the other hand, you save the first line of the example above (with an assignment to a name in the text) as a text file, you can bring it back in with eval(parse(...))
> dget("secondtest.txt")
> test
Error: object 'test' not found
> ?load # Nope, that only works for binary files
> eval( parse(file= "secondtest.txt"))
> test
[1] 1.1 2.0 3.0 4.0
This data frame
df <- structure(list(mpg = c(15.2, 10.4, 13.3, 14.7, 22.8, 15.5, 14.3,
19.7, 32.4, 27.3, 15.8, 30.4, 21.4, 18.7, 10.4, 30.4, 15, 21,
21, 22.8, 24.4, 19.2, 17.8, 16.4, 17.3, 15.2, 19.2, 26, 33.9,
21.5, 18.1, 21.4), cyl = c(8, 8, 8, 8, 4, 8, 8, 6, 4, 4, 8, 4,
6, 8, 8, 4, 8, 6, 6, 4, 4, 6, 6, 8, 8, 8, 8, 4, 4, 4, 6, 4),
disp = c(304, 472, 350, 440, 108, 318, 360, 145, 78.7, 79,
351, 75.7, 258, 360, 460, 95.1, 301, 160, 160, 140.8, 146.7,
167.6, 167.6, 275.8, 275.8, 275.8, 400, 120.3, 71.1, 120.1,
225, 121), hp = c(150, 205, 245, 230, 93, 150, 245, 175,
66, 66, 264, 52, 110, 175, 215, 113, 335, 110, 110, 95, 62,
123, 123, 180, 180, 180, 175, 91, 65, 97, 105, 109), drat = c(3.15,
2.93, 3.73, 3.23, 3.85, 2.76, 3.21, 3.62, 4.08, 4.08, 4.22,
4.93, 3.08, 3.15, 3, 3.77, 3.54, 3.9, 3.9, 3.92, 3.69, 3.92,
3.92, 3.07, 3.07, 3.07, 3.08, 4.43, 4.22, 3.7, 2.76, 4.11
), wt = c(3.435, 5.25, 3.84, 5.345, 2.32, 3.52, 3.57, 2.77,
2.2, 1.935, 3.17, 1.615, 3.215, 3.44, 5.424, 1.513, 3.57,
2.62, 2.875, 3.15, 3.19, 3.44, 3.44, 4.07, 3.73, 3.78, 3.845,
2.14, 1.835, 2.465, 3.46, 2.78), qsec = c(17.3, 17.98, 15.41,
17.42, 18.61, 16.87, 15.84, 15.5, 19.47, 18.9, 14.5, 18.52,
19.44, 17.02, 17.82, 16.9, 14.6, 16.46, 17.02, 22.9, 20,
18.3, 18.9, 17.4, 17.6, 18, 17.05, 16.7, 19.9, 20.01, 20.22,
18.6), vs = c(0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0,
1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1), am = c(0,
0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0,
0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1), gear = c(3, 3, 3, 3,
4, 3, 3, 5, 4, 4, 5, 4, 3, 3, 3, 5, 5, 4, 4, 4, 4, 4, 4,
3, 3, 3, 3, 5, 4, 3, 3, 4), carb = c(2, 4, 4, 4, 1, 2, 4,
6, 1, 1, 4, 2, 1, 2, 4, 2, 8, 4, 4, 2, 2, 4, 4, 3, 3, 3,
2, 2, 1, 1, 1, 2)), .Names = c("mpg", "cyl", "disp", "hp",
"drat", "wt", "qsec", "vs", "am", "gear", "carb"), row.names = c("AMC Javelin:2.1.2.2.2",
"Cadillac Fleetwood:1.2.1.2.1", "Camaro Z28:1.2.2.1.2.2", "Chrysler Imperial:1.2.1.1",
"Datsun 710:2.2.2.2.1.2.2.2.1", "Dodge Challenger:2.1.2.2.1",
"Duster 360:1.2.2.1.2.1", "Ferrari Dino:2.2.2.1", "Fiat 128:2.2.1.2.2.1",
"Fiat X1-9:2.2.1.2.2.2", "Ford Pantera L:1.2.2.1.1", "Honda Civic:2.2.1.1",
"Hornet 4 Drive:2.1.1.1", "Hornet Sportabout:1.2.2.2.1", "Lincoln Continental:1.2.1.2.2",
"Lotus Europa:2.2.2.2.1.1", "Maserati Bora:1.1", "Mazda RX4:2.2.2.2.2.2.1.1",
"Mazda RX4 Wag:2.2.2.2.2.2.1.2", "Merc 230:2.2.2.2.1.2.1", "Merc 240D:2.2.2.2.2.1",
"Merc 280:2.2.2.2.2.2.2.1", "Merc 280C:2.2.2.2.2.2.2.2", "Merc 450SE:2.1.2.1.2.1",
"Merc 450SL:2.1.2.1.2.2", "Merc 450SLC:2.1.2.1.1", "Pontiac Firebird:1.2.2.2.2",
"Porsche 914-2:2.2.2.2.1.2.2.2.2.2", "Toyota Corolla:2.2.1.2.1",
"Toyota Corona:2.2.2.2.1.2.2.2.2.1", "Valiant:2.1.1.2", "Volvo 142E:2.2.2.2.1.2.2.1"
), class = "data.frame")
produces this:
> head(df)
mpg cyl disp hp drat wt qsec vs am gear carb
AMC Javelin:2.1.2.2.2 15.2 8 304 150 3.15 3.435 17.30 0 0 3 2
Cadillac Fleetwood:1.2.1.2.1 10.4 8 472 205 2.93 5.250 17.98 0 0 3 4
Camaro Z28:1.2.2.1.2.2 13.3 8 350 245 3.73 3.840 15.41 0 0 3 4
Chrysler Imperial:1.2.1.1 14.7 8 440 230 3.23 5.345 17.42 0 0 3 4
Datsun 710:2.2.2.2.1.2.2.2.1 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
Dodge Challenger:2.1.2.2.1 15.5 8 318 150 2.76 3.520 16.87 0 0 3 2
Note that the row names are delimited with ":". What I want to do is to split them
and the 2nd part becomes a new column of the data frame:
ancest mpg cyl disp hp drat wt qsec vs am gear carb
AMC Javelin 2.1.2.2.2 15.2 8 304 150 3.15 3.435 17.30 0 0 3 2
Cadillac Fleetwood 1.2.1.2.1 10.4 8 472 205 2.93 5.250 17.98 0 0 3 4
What's the way to do it?
I'm stuck with this:
rn <- rownames(df)
unlist(lapply(rn,strsplit,":"))
We can use strsplit to get the output in a "list", rbind the output to get a matrix "m1". Change the rownames of "df" by the first column and create a new column "ancest" with the second column of "m1"
m1 <- do.call(rbind, strsplit(rn, ':'))
row.names(df) <- m1[,1]
df['ancest'] <- m1[,2]
Or if you need the first column of the dataset as one of the split columns,
df1 <- cbind(ancest=m1[,2], df)
row.names(df1) <- m1[,1]
Or using splitstackshape and data.table
library(data.table)
library(splitstackshape)
# setDT() converts `df` by reference and moves its row names into column "rn".
# NOTE(review): assumes cSplit() drops "rn" and appends the two pieces as
# columns 12 ("rn_1", car name) and 13 ("rn_2", ancestry code) - confirm.
# Selecting c(13, 1:12) puts the ancestry code first and leaves the car name
# in position 13; selecting only column 12 made the `[, 13]` below fail.
df1 <- setDF(cSplit(setDT(df, keep.rownames=TRUE)[],
      'rn', sep=":")[, c(13, 1:12), with=FALSE])
# Name the ancestry column as requested in the question.
names(df1)[1] <- "ancest"
# as.character() guards against the split column being returned as a factor.
rownames(df1) <- as.character(df1[,13])
df1 <- df1[-13]
Try this:
# create a new variable with the row names
df$names <- rownames(df)
# split the new variable into two pieces, delete the pattern (the :), and keep both pieces
# str_split_fixed() returns a two-column character matrix; store each column
# as its own plain vector instead of nesting the matrix inside one column
parts <- stringr::str_split_fixed(df$names, ":", 2)
df$names <- parts[, 1]
df$ancest <- parts[, 2]
Using the sapply and [ functions:
# Split each "name:ancestry" row name once, then pull out the two pieces.
nm_plus_ancest <- rownames(df)
# fixed = TRUE treats ":" as a literal string rather than a regular expression
nm_plus_ancest_split <- strsplit(nm_plus_ancest, ":", fixed = TRUE)
# vapply() always returns a character vector here, whereas the return type of
# sapply() depends on its input
rownames(df) <- vapply(nm_plus_ancest_split, `[`, character(1), 1)
df$ancest <- vapply(nm_plus_ancest_split, `[`, character(1), 2)
And you can rearrange the columns with the nice dplyr::select function:
# library() stops with an error if dplyr is not installed; require() would
# only return FALSE and let the next line fail with a less helpful message
library(dplyr)
# Move `ancest` to the front, followed by the columns from mpg through carb.
df <- select(df, ancest, mpg:carb)