I have 50 data frames that all have the same column names (e.g. df1: colnames = ID, A, B, C, D; df2: colnames = ID, A, B, C, D; and so on).
I need to rename these so it becomes df1: colnames = ID, Mth1_A, Mth1_B, Mth1_C, Mth1_D and then df2: ID, Mth2_A, Mth2_B, Mth2_C, Mth2_D. So each column name should correspond to the number of the dataframe.
I've created a function that does this;
# Prefix every column except the first one with "Mth<Mth>_".
#
# df:  a data frame; its first column (the ID) keeps its name.
# Mth: the number (or label) to embed in the prefix, e.g. Mth = 2 turns
#      column "A" into "Mth2_A".
# Returns df with the renamed columns.
col_prefix <- function(df, Mth) {
  # seq_len(ncol(df))[-1] is empty for a one-column frame, whereas
  # 2:ncol(df) would count down (2, 1) and touch the ID column as well.
  rename_idx <- seq_len(ncol(df))[-1]
  # Use the Mth argument itself; pasting the literal "Mth" ignores it.
  colnames(df)[rename_idx] <- paste0("Mth", Mth, "_", colnames(df)[rename_idx])
  df
}
But I'm now trying to create a loop to do it for all 50 and I can't get it to work. This is what I've got so far
# Names of the 50 data frames, built programmatically ("df1" ... "df50").
dfList <- paste0("df", 1:50)
for (filename in dfList) {
  # The number embedded in the object name, e.g. "df12" -> "12".
  Mth <- sub("^\\D+", "", filename)
  # Fetch the data frame by name, rename its columns, and store the result
  # back under the same name; without assign() the renamed copy is lost
  # at the end of each iteration.
  assign(filename, col_prefix(get(filename), Mth))
}
It's adding the prefix "Mth" to the data frames, but it's not producing "Mth1", "Mth2", etc. I'm fairly sure this is because in my function "Mth" is a character string, but I don't know how to loop through this.
Please help!
Put them in a list and use their names (df1, df2, etc.) to extract the number for the prefix, i.e.
# Collect every object named df<number> (df1, df2, ...) into a named list.
# The pattern is anchored so that names which merely contain "df1"
# (e.g. "mydf10x") are not picked up.
l1 <- mget(grep(pattern = "^df[0-9]+$", x = ls(), value = TRUE))
# Prefix every column with "MTH<number>_", where the number is whatever
# digits appear in the data frame's own name.
Map(function(x, y) setNames(x, paste0('MTH', gsub('\\D+', '', y), '_', names(x))),
    l1, names(l1))
$df1
MTH1_v1 MTH1_v2
1 5 9
2 6 10
3 7 11
$df2
MTH2_v1 MTH2_v2
1 15 19
2 16 110
3 17 111
To change all names except the first one then,
# Prefix every column except the first with "MTH<number>_", where the number
# is taken from the digits in the list element's name.
# The names are changed in place rather than rebuilding the frame with
# data.frame(), which would pass the names through check.names and could
# alter non-syntactic or duplicated names.
Map(function(x, y) {
  names(x)[-1] <- paste0('MTH', gsub('\\D+', '', y), '_', names(x)[-1])
  x
}, l1, names(l1))
$df1
v1 MTH1_v2
1 5 9
2 6 10
3 7 11
$df2
v1 MTH2_v2
1 15 19
2 16 110
3 17 111
DATA
dput(df1)
structure(list(v1 = c(5, 6, 7), v2 = c(9, 10, 11)), class = "data.frame", row.names = c(NA,
-3L))
dput(df2)
structure(list(v1 = c(15, 16, 17), v2 = c(19, 110, 111)), class = "data.frame", row.names = c(NA,
-3L))
Related
This question already has an answer here:
lapply and mutate_all/for loops
(1 answer)
Closed 2 years ago.
Sample data:
dat1 <- structure(list(id = 1:3, des.1 = 4:6, x = 7:9, not = 10:12), class = "data.frame", row.names = c(NA,-3L))
dat2 <- structure(list(id = 1:3, descript = 4:6, y = 7:9, yes = 10:12), class = "data.frame", row.names = c(NA,-3L))
dat3 <- structure(list(id = 1:3, description = 4:6, x = 7:9, X4 = 10:12), class = "data.frame", row.names = c(NA,-3L))
dat1[1,2] <- "ERROR"
dat2[2,1] <- "ERROR"
dat_list <- list(dat1, dat2, dat3)
How can I set all instances of 'ERROR' to 0 within this list of dataframe? If possible, a plyr solution would be preferred.
Many thanks.
You can use map to iterate over list :
library(dplyr)
library(purrr)
# Replace every 'ERROR' cell with 0 in each data frame, then re-infer the
# column types. as.is = TRUE is given explicitly: it keeps any remaining
# character columns as character and avoids the warning type.convert()
# emits when the caller does not specify it.
map(dat_list, ~.x %>% mutate_all(~replace(., . == 'ERROR', 0)) %>% type.convert(as.is = TRUE))
In new dplyr you can use across :
# Same replacement using across(): set every 'ERROR' cell to 0, then
# re-infer the column types. as.is = TRUE is given explicitly so character
# columns stay character and type.convert() does not warn.
map(dat_list, ~.x %>%
  mutate(across(everything(), ~replace(., . == 'ERROR', 0))) %>%
  type.convert(as.is = TRUE))
In base R, we can use lapply :
# Base R version: overwrite every 'ERROR' cell with 0, then re-infer the
# column types of each data frame.
lapply(dat_list, function(x) {
  x[x == 'ERROR'] <- 0
  # as.is = TRUE: keep character columns as character (no factors) and
  # avoid the warning raised when as.is is left unspecified.
  type.convert(x, as.is = TRUE)
})
#[[1]]
# id des.1 x not
#1 1 0 7 10
#2 2 5 8 11
#3 3 6 9 12
#[[2]]
# id descript y yes
#1 1 4 7 10
#2 0 5 8 11
#3 3 6 9 12
#[[3]]
# id description x X4
#1 1 4 7 10
#2 2 5 8 11
#3 3 6 9 12
I have a list containing a number of data frames, all with the same number of columns.
E.g, for a list df_list with two data frames, df1 and df2:
>df_list
df1
a b c
1 1 1
2 2 2
3 3 3
df2
a b c
3 2 1
3 2 1
3 2 1
I want to rename the headers of every data frame to new_headings <- c("A", "B", "C").
I constructed a for loop:
# Give every data frame stored in `list` the column names in new_headings.
# NOTE: `list` here is the user's own object; the name shadows base::list(),
# so a more specific name is safer.
# seq_along() performs no iterations for an empty list, whereas
# 1:length(list) would iterate over c(1, 0).
for (i in seq_along(list)) {
  names(list[[i]]) <- new_headings
}
However, this doesn't work. The headings remain as they were. If I do it individually instead of in a loop, it works fine, however, e.g., names(list[[1]]) <- new_headings changes the headings appropriately.
My actual list is very long with many data frames. Can anyone explain why this isn't working or what other approach I can use? Thank you.
We can use Map with setNames
df_listNew <- Map(setNames, df_list, list(new_headings))
Or using lapply
lapply(df_list, setNames, new_headings)
#$df1
# A B C
#1 1 1 1
#2 2 2 2
#3 3 3 3
#$df2
# A B C
#1 3 2 1
#2 3 2 1
#3 3 2 1
data
df_list <- list(df1 = structure(list(a = 1:3, b = 1:3, c = 1:3),
class = "data.frame", row.names = c(NA,
-3L)), df2 = structure(list(a = c(3, 3, 3), b = c(2, 2, 2), c = c(1,
1, 1)), class = "data.frame", row.names = c(NA, -3L)))
You can use two for loops
# Build two example data frames with columns a, b and c.
# data.frame() is used directly; going through cbind() first creates a
# matrix, which would coerce all columns to a single type.
a <- c(1, 2, 3)
b <- c(1, 2, 3)
c <- c(1, 2, 3)
df1 <- data.frame(a, b, c)
a <- c(3, 2, 1)
b <- c(3, 2, 1)
c <- c(3, 2, 1)
df2 <- data.frame(a, b, c)
df_list <- list(df1, df2)
new_headings <- c("A", "B", "C")
# Outer loop: each data frame; inner loop: each of its columns.
# seq_along() is safe for empty inputs, unlike 1:length(x).
# (The inner loop is equivalent to names(df_list[[i]]) <- new_headings.)
for (i in seq_along(df_list)) {
  for (j in seq_along(df_list[[i]])) {
    colnames(df_list[[i]])[j] <- new_headings[j]
  }
}
df_list
I have a cbind of 2 data.frames called DATA. Using BASE R, I was wondering how I could extract and then, cbind similarly named variables in DATA and store them as a list?
For the example below, I want all variable AAs, and separately all variable BBs in DATA be separately cbinded and stored as a list?
Note: names could be anything, and the number of variables could be any number. A function(al) solution is highly appreciated.
Note: suppose we have NO ACCESS to r, the only input is DATA.
# Example input: a list of four data frames that all share the columns
# Name, X, Y, Z and out.
r <- list(
data.frame(Name = rep("Jacob", 6),
X = c(2,2,1,1,NA, NA),
Y = c(1,1,1,2,1,NA),
Z = rep(3, 6),
out = rep(1, 6)),
data.frame(Name = rep("Jon", 6),
X = c(1,NA,3,1,NA,NA),
Y = c(1,1,1,2,NA,NA),
Z = rep(2, 6),
out = rep(1, 6)),
data.frame(Name = rep("Jon", 6),
X = c(1,NA,3,1,NA,NA),
Y = c(1,1,1,2,2,NA),
Z = rep(2, 6),
out = rep(2, 6)),
data.frame(Name = rep("Jim", 6),
X = c(1,NA,3,1,NA,NA),
Y = c(1,1,1,2,2,NA),
Z = rep(2, 6),
out = rep(1, 6)))
DATA <- do.call(cbind, r) ## DATA: column-wise cbind of all four data.frames in r
Here is an option with split. I wouldn't recommend having duplicate column names in the dataset. But, if it is really needed, after the split, change the column names by removing the . followed by one or more digits at the end of each name with sub
# Column names that occur in every data frame of r.
nm1 <- Reduce(intersect, lapply(r, names))
# Keep only those columns of DATA and group them by their original name.
lst1 <- local({
  shared <- names(DATA) %in% nm1
  split.default(DATA[shared], names(DATA)[shared])
})
# Strip the ".<digits>" suffix left on repeated names within each group.
lapply(lst1, function(grp) {
  names(grp) <- sub("\\.\\d+$", "", names(grp))
  grp
})
Or, if we need to use only 'DATA' and not 'r' to find the intersecting column names: it is more difficult, but we can compute how often each column name occurs and select the names with the highest frequency
tbl <- table(names(DATA))
nm1 <- names(which(tbl==max(tbl)))
Use that in the split.default as before
lst1 <- split.default(DATA[names(DATA) %in% nm1], names(DATA)[names(DATA) %in% nm1])
lapply(lst1, function(x) setNames(x, sub("\\.\\d+$", "", names(x))))
Using OP's new example
# Three example data frames: AA and BB occur in all of them, CC and DD in
# one each.
r <- list(
  data.frame(AA = c(2, 2, 1, 1, 3, 2), BB = c(1, 1, 1, 2, 2, NA), CC = 1:6),
  data.frame(AA = c(1, NA, 3, 1, 3, 2), BB = c(1, 1, 1, 2, 2, 2)),
  data.frame(AA = c(1, NA, 3, 1, 3, 2), BB = c(1, 1, 1, 2, 2, 2), DD = 0:5)
)
DATA <- do.call(cbind, r)
# Count how often each column name occurs and keep the most frequent ones.
tbl <- table(names(DATA))
nm1 <- names(tbl)[tbl == max(tbl)]
# Select those columns and group them by their original name.
lst1 <- local({
  shared <- names(DATA) %in% nm1
  split.default(DATA[shared], names(DATA)[shared])
})
# Strip the ".<digits>" suffix left on repeated names within each group.
lapply(lst1, function(grp) {
  names(grp) <- sub("\\.\\d+$", "", names(grp))
  grp
})
#$AA
# AA AA AA
#1 2 1 1
#2 2 NA NA
#3 1 3 3
#4 1 1 1
#5 3 3 3
#6 2 2 2
#$BB
# BB BB BB
#1 1 1 1
#2 1 1 1
#3 1 1 1
#4 2 2 2
#5 2 2 2
#6 NA 2 2
I have 5 data sets, each containing some columns. The data sets have common column names, but all columns are not present in all the data sets. So whenever a column name (that appears in at least one of the data set) is not present in some other data set, I want to create a column of all zeros with that column name in that data set. So that all the data sets have same number of columns (and same column names).
Put the dataframes in a list, get all the unique column names present across the dataframes combined, and add the columns that are absent from each dataframe, filled with 0.
# Every column name that occurs in at least one data frame.
# lapply() always returns a list; sapply() may instead simplify to a matrix
# or vector depending on the input, so its return type is not stable.
all_names <- unique(unlist(lapply(list_df, names)))
# For each data frame, add the columns it lacks and fill them with 0.
lst1 <- lapply(list_df, function(x) {
  x[setdiff(all_names, names(x))] <- 0
  x
})
lst1
#[[1]]
# a b c
#1 1 6 0
#2 2 7 0
#3 3 8 0
#4 4 9 0
#5 5 10 0
#[[2]]
# a c b
#1 1 6 0
#2 2 7 0
#3 3 8 0
#4 4 9 0
#5 5 10 0
#[[3]]
# a c b
#1 1 6 11
#2 2 7 12
#3 3 8 13
#4 4 9 14
#5 5 10 15
If you need separate dataframes you can use lst1[[1]], lst1[[2]] individually again.
data
df1 <- data.frame(a = 1:5, b = 6:10)
df2 <- data.frame(a = 1:5, c = 6:10)
df3 <- data.frame(a = 1:5, c = 6:10, b = 11:15)
list_df <- list(df1, df2, df3)
We can use a for loop to do this
un1 <- Reduce(union, lapply(lst1, names))
for(i in seq_along(lst1)) lst1[[i]][setdiff(un1, names(lst1[[i]]))] <- 0
data
lst1 <- list(structure(list(a = 1:5, b = 6:10, c = c(0, 0, 0, 0, 0)),
row.names = c(NA,
-5L), class = "data.frame"), structure(list(a = 1:5, c = 6:10,
b = c(0, 0, 0, 0, 0)),
row.names = c(NA, -5L), class = "data.frame"),
structure(list(a = 1:5, c = 6:10, b = 11:15),
class = "data.frame", row.names = c(NA,
-5L)))
I would use dplyr's bind_rows, which automatically fills missing values with NA. If you include .id = "df_id" a column will be added connecting each row to the original dataframe:
library(dplyr)
bind_rows(df1, df2, df3, .id = "df_id")
#### OUTPUT ####
df_id x y z
1 1 1 2 NA
2 2 3 NA 4
3 3 NA 5 6
If you want 0s instead of NAs, just run df[is.na(df)] <- 0. If you want a more informative df_id column, you can pass in a named list:
bind_rows(list(df1 = df1, df2 = df2, df3 = df3), .id = "df_id")
#### OUTPUT ####
df_id x y z
1 df1 1 2 NA
2 df2 3 NA 4
3 df3 NA 5 6
If you want your dataframes separate then simply split by df_id, which generates a list of dataframes:
df <- bind_rows(df1, df2, df3, .id = "df_id")
split(df, df$df_id)
#### OUTPUT ####
$`1`
df_id x y z
1 1 1 2 NA
$`2`
df_id x y z
2 2 3 NA 4
$`3`
df_id x y z
3 3 NA 5 6
Data:
df1 <- data.frame(x = 1, y = 2)
df2 <- data.frame(x = 3, z = 4)
df3 <- data.frame(y = 5, z = 6)
In addition to the previous answers, you can use the bind_rows function in order to quickly combine all your data frames, which will take care of differences in column names:
library(dplyr)
x <- data.frame(
a = 1:3,
b = 4:6
)
y <- data.frame(
a = 4:7
)
z <- data.frame(
c = 8:10
)
xyz <- bind_rows(x, y, z)
xyz %>% replace(., is.na(.), 0)
I have a data.frame
data
data = structure(list(mystring = c("AASDAASADDLKJLKADDLKKLLKJLJADDLJLKJLADLKLADD",
"ASDSDFJSKADDKJSJKDFKSADDLKJFLAK"), class = c("cat", "dog")), .Names = c("mystring",
"class"), row.names = c(NA, -2L), class = "data.frame")
which looks like
#> dtt1
# mystring class
#1 AASDAASADDLKJLKADDLKKLLKJLJADDLJLKJLADLKLADD cat
#2 ASDSDFJSKADDKJSJKDFKSADDLKJFLAK dog
I am searching for the start and end positions of the pattern "ADD" within the first 20 characters of the strings under mystring, considering class as the group.
I am doing this using str_locate of stringr package. Here is my attempt
setDT(dtt1)[,
cbind(list(str_locate_all(substr(as.character(mystring), 1, 20),"ADD")[[1]][,1]),
list(str_locate_all(substr(as.character(mystring), 1, 20),"ADD")[[1]][,2])),
by = class]
This gives my desired output
# class V1 V2
#1: cat 8 10
#2: cat 16 18
#3: dog 10 12
Question:
I would like to know if this is a standard approach or this can be done in a more efficient manner. str_locate gives the start and end positions of the matched pattern in separate columns, and I am putting them in separate list to cbind them together with the data.table? Also how can I specify the colnames for the cbinded columns here?
I think you first should reduce your operations per group, so I would first create a substring for all groups at once.
# Add a column holding the first 20 characters of each string, computed once
# for the whole table. The public substr() is used rather than .Internal(),
# which is not part of R's supported interface.
setDT(data)[, submystring := substr(mystring, 1L, 20L)]
Then, using the stringi package (I don't like wrappers), you could do (though can't currently vouch for efficiency)
library(stringi)
data[, data.table(matrix(unlist(stri_locate_all_fixed(submystring, "ADD")), ncol = 2)), by = class]
# class V1 V2
# 1: cat 8 10
# 2: cat 16 18
# 3: dog 10 12
Alternatively, you could avoid matrix and data.table calls per group but spread the data after all the location were detected
# Locate every "ADD" match per class and flatten the result. With one string
# per class (as in the sample data) the flattened values are all the start
# positions followed by all the end positions.
res <- data[, unlist(stri_locate_all_fixed(submystring, "ADD")), by = class]
# Label each value as a start (V1) or end (V2) and number the matches.
# The match index has to be repeated exactly twice (once for the starts and
# once for the ends); repeating it .N/2 times yields the wrong length as
# soon as a group has three or more matches.
res[, `:=`(varnames = rep(c("V1", "V2"), each = .N %/% 2L),
           MatchCount = rep(seq_len(.N %/% 2L), times = 2L)), by = class]
# Spread starts and ends into separate columns, one row per match.
dcast(res, class + MatchCount ~ varnames, value.var = "V1")
# class MatchCount V1 V2
# 1: cat 1 8 10
# 2: cat 2 16 18
# 3: dog 1 10 12
A third, similar option could be to first run stri_locate_all_fixed over the whole data set and only then unlist per group (instead of running both unlist and stri_locate_all_fixed per group)
# Run the search once over the whole column; the unnamed result becomes the
# list column V1 (one start/end matrix per row).
res <- data[, .(stri_locate_all_fixed(submystring, "ADD"), class = class)]
# Number of matches per row: each matrix holds a start and an end per match.
res[, N := lengths(V1)/2L]
# Flatten the matrices per group only now.
res2 <- res[, unlist(V1), by = "class,N"]
# Label each value as a start (V1) or end (V2) and number the matches.
# The match index must be repeated exactly twice (starts, then ends);
# repeating it N times has the wrong length once N is three or more.
res2[, `:=`(varnames = rep(c("V1", "V2"), each = N[1L]),
            MatchCount = rep(seq_len(N[1L]), times = 2L)), by = class]
# Spread starts and ends into separate columns, one row per match.
dcast(res2, class + MatchCount ~ varnames, value.var = "V1")
# class MatchCount V1 V2
# 1: cat 1 8 10
# 2: cat 2 16 18
# 3: dog 1 10 12
We could change the matrix output from str_locate_all to data.frame and use rbindlist to create the columns.
setDT(data)[,rbindlist(lapply(str_locate_all(substr(mystring, 1, 20),
'ADD'), as.data.frame)) , class]
# class start end
#1: cat 8 10
#2: cat 16 18
#3: dog 10 12
Here's how I did it.
library(stringi)
library(dplyr)
library(magrittr)
data = structure(list(mystring = c("AASDAASADDLKJLKADDLKKLLKJLJADDLJLKJLADLKLADD",
"ASDSDFJSKADDKJSJKDFKSADDLKJFLAK"), class = c("cat", "dog")), .Names = c("mystring",
"class"), row.names = c(NA, -2L), class = "data.frame")
# Locate "ADD" within the first 20 characters of row$mystring.
#
# row: a data frame (here, one group passed in by do()) with a mystring
#      column.
# Returns the start/end positions as a tibble built from the first matrix
# returned by stri_locate_all_fixed().
my_function <- function(row) {
  row$mystring %>%
    stri_sub(to = 20) %>%
    stri_locate_all_fixed(pattern = "ADD") %>%
    # The locate call returns a list of matrices; take the first one.
    extract2(1) %>%
    # as_tibble() replaces the deprecated as_data_frame().
    as_tibble()
}
# Apply my_function() to each distinct string, then attach the remaining
# columns of data (class) to the located positions.
test <-
  data %>%
  group_by(mystring) %>%
  do(my_function(.)) %>%
  # Name the join key explicitly instead of relying on the implicit
  # "joining by common columns" behaviour.
  left_join(data, by = "mystring")