Loop by variable names in R

I want to create a for loop over variable names.
For each pair of variables, I calculate the maximum and add it as a new column in the data frame df. The new variables look like this: var1_2, var1_3, ... Here is my code:
df=data.frame(matrix(c(1:6), nrow = 2))
colnames(df) = c("x", "y", "z")
for(i in length(names(df))-1){
  df = df %>% mutate(paste0("var", i, "_", i+1) = max(names(df)[i], names(df)[i+1]))
}
But this throws an error.
Expected output:
> df
  x y z var1_2 var1_3 var2_3
1 1 3 5      3      5      5
2 2 4 6      4      6      6

One way via base R:
# row-wise max of every pair of columns
m1 <- sapply(combn(names(df), 2, simplify = FALSE), function(i) do.call(pmax, df[i]))
# matching names Var1_2, Var1_3, Var2_3 for the column pairs
nms <- combn(ncol(df), 2, function(i) paste0('Var', i[1], '_', i[2]))
cbind(df, setNames(data.frame(m1), nms))
# x y z Var1_2 Var1_3 Var2_3
#1 1 3 5 3 5 5
#2 2 4 6 4 6 6

If you really want to use a loop, you can try:
ind <- combn(3, 2)
for(i in 1:ncol(ind)){
  j <- ind[, i]
  name <- paste0("var", j[1], "_", j[2])
  val <- names(df)[j[ifelse(sum(df[, j[1]]) > sum(df[, j[2]]), 1, 2)]]
  df <- mutate_(df, .dots = setNames(list(val), name))
}
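Since mutate_ is deprecated in current dplyr, here is a minimal base-R sketch of the same idea, assuming df with columns x, y, z as defined above; pmax() gives the row-wise maximum directly:
# combn() enumerates the column pairs once, before the loop starts adding new columns
for (idx in combn(ncol(df), 2, simplify = FALSE)) {
  df[[paste0("var", idx[1], "_", idx[2])]] <- pmax(df[[idx[1]]], df[[idx[2]]])
}
df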


Change the names of multiple cells in R using a function

Consider the following data frame:
df <- setNames(data.frame(1:5,rep(1,5)), c("id", "value"))
I want to change the names for multiple cells in the column "id". Let's say I want to change the following:
df$id[df$id %in% 2:3] <- 1
df$id[df$id == 4] <- 3
However, instead of using the code above, I want to create a function where I can do the transformation more smoothly (because I have a lot of data frames where I need to change the names of the cells). I want to create a function:
mapping <- function(...) {
...
}
where I afterwards can create a simple and smooth mapping function for my df, where I only have to specify the "old" and the "new" names for the cells. Something like this:
df_mapping <- function(...) {
  # 2 -> 1
  # 3 -> 1
  # 4 -> 3
}
And then I can apply the function to my data and specify which column it should do it for, and it will work in the same way as the code with gsub:
df <- df_mapping(df,id)
Is it possible to create that mapping function?
If we need a function, it can have a 'data' argument, a column name, the values to replace, and a replacer value; then create the logical condition, subset the column, assign the replacer value, and return the dataset after the assignment:
f1 <- function(dat, colnm, values_to_replace, replacer_val) {
  dat[[colnm]][dat[[colnm]] %in% values_to_replace] <- replacer_val
  return(dat)
}
f1(df, "id", c(2, 3), 1)
Output:
# id value
#1 1 1
#2 1 1
#3 1 1
#4 4 1
#5 5 1
To replace values with a corresponding set of replacement values:
f2 <- function(dat, colnm, values_to_replace, replacer_vals) {
  nm1 <- setNames(replacer_vals, values_to_replace)
  v1 <- nm1[as.character(dat[[colnm]])]
  i1 <- !is.na(v1)
  dat[[colnm]][i1] <- v1[i1]
  return(dat)
}
f2(df, "id", c(2, 3), c(5, 6))
# id value
#1 1 1
#2 5 1
#3 6 1
#4 4 1
#5 5 1
Or another option is to create a key/value dataset and use merge or join:
library(data.table)
f3 <- function(dat, colnm, values_to_replace, replacer_vals) {
  keydat <- data.frame(key = values_to_replace, val = replacer_vals)
  names(keydat)[1] <- colnm
  dt <- as.data.table(dat)
  dt[keydat, (colnm) := val, on = colnm][]
  return(dt)
}
f3(df, "id", c(2, 5), c(3, 6))
Maybe a mapping function like the one below could help:
mapping <- function(df, id, to_replace, obj_value) {
  transform(df, id = replace(id, id %in% to_replace, obj_value))
}
e.g.,
> mapping(df, id, c(2, 3), 1)
id value
1 1 1
2 1 1
3 1 1
4 4 1
5 5 1
You can use dplyr's recode function:
mapping <- function(data, col, old, new) {
  data[[col]] <- dplyr::recode(data[[col]], !!!setNames(new, old))
  data
}
mapping(df, "id", c(2, 3), c(7L, 8L))
# id value
#1 1 1
#2 7 1
#3 8 1
#4 4 1
#5 5 1
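For reference, a standalone sketch of how the splice works: setNames(new, old) builds a vector of replacement values named by the old values, and !!! splices those named pairs into recode() as arguments.
setNames(c(7L, 8L), c(2, 3))
# 2 3
# 7 8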

Extracting and cbinding similarly named variables in a data.frame in R

I have a cbind of several data.frames called DATA. Using BASE R, I was wondering how I could extract and then cbind similarly named variables in DATA and store them as a list?
For the example below, I want all the AA variables, and separately all the BB variables, in DATA to be cbinded and stored as a list.
Note: names could be anything, and the number of variables could be any number. A function(al) solution is highly appreciated.
Note: suppose we have NO ACCESS to r, the only input is DATA.
r <- list(
data.frame(Name = rep("Jacob", 6),
X = c(2,2,1,1,NA, NA),
Y = c(1,1,1,2,1,NA),
Z = rep(3, 6),
out = rep(1, 6)),
data.frame(Name = rep("Jon", 6),
X = c(1,NA,3,1,NA,NA),
Y = c(1,1,1,2,NA,NA),
Z = rep(2, 6),
out = rep(1, 6)),
data.frame(Name = rep("Jon", 6),
X = c(1,NA,3,1,NA,NA),
Y = c(1,1,1,2,2,NA),
Z = rep(2, 6),
out = rep(2, 6)),
data.frame(Name = rep("Jim", 6),
X = c(1,NA,3,1,NA,NA),
Y = c(1,1,1,2,2,NA),
Z = rep(2, 6),
out = rep(1, 6)))
DATA <- do.call(cbind, r) ## DATA: cbind of the data.frames in r
Here is an option with split. I wouldn't recommend having duplicate column names in a dataset, but if it is really needed, after the split change the column names by removing the . followed by one or more numbers at the end with sub:
nm1 <- Reduce(intersect, lapply(r, colnames)) # get the common names
lst1 <- split.default(DATA[names(DATA) %in% nm1], names(DATA)[names(DATA) %in% nm1])
lapply(lst1, function(x) setNames(x, sub("\\.\\d+$", "", names(x))))
Or, if we need to use only 'DATA' and not 'r' for finding the intersecting column names: it is more difficult, but we can get the frequency of occurrence of the column names and select those with the highest frequency.
tbl <- table(names(DATA))
nm1 <- names(which(tbl==max(tbl)))
Use that in the split.default as before
lst1 <- split.default(DATA[names(DATA) %in% nm1], names(DATA)[names(DATA) %in% nm1])
lapply(lst1, function(x) setNames(x, sub("\\.\\d+$", "", names(x))))
Using OP's new example
r <- list(
  data.frame(AA = c(2,2,1,1,3,2), BB = c(1,1,1,2,2,NA), CC = 1:6),
  data.frame(AA = c(1,NA,3,1,3,2), BB = c(1,1,1,2,2,2)),
  data.frame(AA = c(1,NA,3,1,3,2), BB = c(1,1,1,2,2,2), DD = 0:5)
)
DATA <- do.call(cbind, r)
tbl <- table(names(DATA))
nm1 <- names(which(tbl==max(tbl)))
lst1 <- split.default(DATA[names(DATA) %in% nm1], names(DATA)[names(DATA) %in% nm1])
lapply(lst1, function(x) setNames(x, sub("\\.\\d+$", "", names(x))))
#$AA
# AA AA AA
#1 2 1 1
#2 2 NA NA
#3 1 3 3
#4 1 1 1
#5 3 3 3
#6 2 2 2
#$BB
# BB BB BB
#1 1 1 1
#2 1 1 1
#3 1 1 1
#4 2 2 2
#5 2 2 2
#6 NA 2 2
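Since a function(al) solution was asked for, the frequency-based approach can be wrapped into a small helper; split_repeated is just an illustrative name, and the sketch assumes DATA is the only input, as in the question.
split_repeated <- function(DATA) {
  tbl <- table(names(DATA))
  nm1 <- names(which(tbl == max(tbl)))
  lst1 <- split.default(DATA[names(DATA) %in% nm1],
                        names(DATA)[names(DATA) %in% nm1])
  lapply(lst1, function(x) setNames(x, sub("\\.\\d+$", "", names(x))))
}
split_repeated(DATA) # returns the same $AA / $BB list as above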

Using rep inside sapply to stretch a vector according to another vector

I want to generate a data.frame of edges. Problems arise when many edges end on one node. Edges are defined in vectors from and to.
# Data
vertices <- data.frame(id = 1:3, label = c("a", "b", "c"), stringsAsFactors = FALSE)
to <- c("a", "b", "c")
from1 <- c("c", "a", "b")
from2 <- c("c", "a", "a,b,c")
What I tried:
# Attempt 1
create_edges_1 <- function(from, to) {
  to <- sapply(to, function(x){vertices$id[vertices$label == x]})
  from <- sapply(from, function(x){vertices$id[vertices$label == x]})
  data.frame(from = from, to = to, stringsAsFactors = FALSE)
}
This works, for example create_edges_1(from1, to); the output is:
from to
c 3 1
a 1 2
b 2 3
However for example from2 this attempt fails.
So I tried the following:
# Attempt 2
create_edges_2 <- function(from, to) {
  to <- sapply(unlist(sapply(strsplit(to, ","), function(x){vertices$id[vertices$label == x]})), function(x){rep(x, sapply(strsplit(from2, ","), length))})
  from <- unlist(sapply(strsplit(from2, ","), function(x){vertices$id[vertices$label == x]}))
  data.frame(from = from, to = to, stringsAsFactors = FALSE)
}
The idea was to "stretch" to for every node where more than one edge ends. However, create_edges_2(from1, to) and create_edges_2(from2, to) both throw an error:
Error in rep(x, sapply(strsplit(from2, ","), length)) :
invalid 'times' argument
What am I doing wrong in my sapply statements?
The expected output for create_edges_2(from2, to) is:
from to
3 1
1 2
1 3
2 3
3 3
You could use joins or match for this
f2 <- strsplit(from2, ',')
df <- data.frame(from = unlist(f2)
, to = rep(to, lengths(f2))
, stringsAsFactors = FALSE)
With match
library(tidyverse)
map_dfc(df, ~ with(vertices, id[match(.x, label)]))
# # A tibble: 5 x 2
# from to
# <int> <int>
# 1 3 1
# 2 1 2
# 3 1 3
# 4 2 3
# 5 3 3
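If purrr is not needed, the same match() lookup can be written with a plain lapply; a base-R sketch assuming df and vertices as defined above (a copy is used so df stays character for the join below):
df2 <- df
df2[] <- lapply(df2, function(col) vertices$id[match(col, vertices$label)])
df2 # same result as the tibble above, as a plain data.frame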
With Joins
library(dplyr)
df %>%
  inner_join(vertices, by = c(from = 'label')) %>%
  inner_join(vertices, by = c(to = 'label')) %>%
  select_at(vars(matches('.x|.y')))
# id.x id.y
# 1 3 1
# 2 1 2
# 3 1 3
# 4 2 3
# 5 3 3
Here is a way:
# Attempt 3
library(dplyr)
to <- sapply(to, function(x){vertices$id[vertices$label == x]})
from0 <- sapply(from2, function(x) strsplit(x, ",")) %>% unlist() %>% as.character()
lengths0 <- lapply(sapply(from2, function(x) strsplit(x, ",")), length) %>% unlist()
to0 <- c()
for( i in 1:length(lengths0)) to0 <- c(to0, rep(to[i], lengths0[i]))
from <- sapply(from0, function(x){vertices$id[vertices$label == x]})
edges <- data.frame(from = from, to = to0, stringsAsFactors = FALSE)
edges
Giving this result as requested:
from to
1 3 1
2 1 2
3 1 3
4 2 3
5 3 3
The idea is to split from on the comma separators and to store the size of each element in order to "stretch" to for every node; here it is done with a for loop.
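A compact base-R sketch of the same idea (assuming vertices, from2 and to from the question): split on the commas, repeat to with lengths(), and map labels to ids with match().
f2 <- strsplit(from2, ",")
edges <- data.frame(from = match(unlist(f2), vertices$label),
                    to   = match(rep(to, lengths(f2)), vertices$label))
edges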

Update existing data.frame with values from another one if missing

I'm looking for (1) the name and (2) a (cleaner) method in R (base and data.table preferred) for the following.
Input
> d1
id x y
1 1 1 NA
2 2 NA 3
3 3 4 NA
> d2
id x y z
1 4 NA 30 a
2 3 20 2 b
3 2 14 NA c
4 1 15 97 d
(note that the actual data.frames have hundreds of columns)
Expected output:
> d1
id x y z
1 1 1 97 d
2 2 14 3 c
3 3 4 2 b
Data and current solution:
d1 <- data.frame(id = 1:3, x = c(1, NA, 4), y = c(NA, 3, NA))
d2 <- data.frame(id = 4:1, x = c(NA, 20, 14, 15), y = c(30, 2, NA, 97), z = letters[1:4])
for (col in setdiff(names(d1), "id")) {
  # If missing look in d2
  missing <- is.na(d1[[col]])
  d1[missing, col] <- d2[match(d1$id[missing], d2$id), col]
}
for (col in setdiff(names(d2), names(d1))) {
  # If column missing then add
  d1[[col]] <- d2[match(d1$id, d2$id), col]
}
PS: Likely this question has been asked before, but I'm lacking the vocabulary to search for it.
Assuming you are working with 2 data.frames, here is a base solution
# expand d1 to have the same columns as d2
d <- merge(d1, d2[, c("id", setdiff(names(d2), names(d1))), drop=FALSE],
           by="id", all.x=TRUE, all.y=FALSE)
# make sure that d2 also has the same columns as d
d2 <- merge(d2, d1[, c("id", setdiff(names(d1), names(d2))), drop=FALSE],
            by="id", all.x=TRUE, all.y=FALSE)
# align rows and columns of d2 to match those in d
mask <- d2[match(d1$id, d2$id), names(d)]
# replace NAs in d with the corresponding values from mask
replace(d, is.na(d), mask[is.na(d)])
If you don't mind, we can rewrite your question into a general matrix-coalesce question (i.e. any number of matrices, columns, rows), which seems like it has not been asked before.
edit:
Another base R solution is a hack of coalesce1a from How to implement coalesce efficiently in R
coalesce.mat <- function(...) {
  ans <- ..1
  for (elt in list(...)[-1]) {
    rn <- match(ans$id, elt$id)
    ans[is.na(ans)] <- elt[rn, names(ans)][is.na(ans)]
  }
  ans
}
allcols <- Reduce(union, lapply(list(d1, d2), names))
do.call(coalesce.mat,
        lapply(list(d1, d2), function(x) {
          x[, setdiff(allcols, names(x))] <- NA
          x
        }))
edit:
a possible data.table solution using coalesce1a from How to implement coalesce efficiently in R by Martin Morgan.
coalesce1a <- function(...) {
  ans <- ..1
  for (elt in list(...)[-1]) {
    i <- which(is.na(ans))
    ans[i] <- elt[i]
  }
  ans
}
setDT(d1)
setDT(d2)
#melt into long formats and full outer join the 2
mdt <- merge(melt(d1, id.vars="id"), melt(d2, id.vars="id"), by=c("id","variable"), all=TRUE)
#perform a coalesce on vectors
mdt[, value := do.call(coalesce1a, .SD), .SDcols=grep("value", names(mdt), value=TRUE)]
#pivot into original format and subset to those in d1
dcast.data.table(mdt, id ~ variable, value.var="value")[
d1, .SD, on=.(id)]
Here is a possibility using dplyr::left_join:
left_join(d1, d2, by = "id") %>%
  mutate(
    x = ifelse(!is.na(x.x), x.x, x.y),
    y = ifelse(!is.na(y.x), y.x, y.y)) %>%
  select(id, x, y, z)
# id x y z
#1 1 1 97 d
#2 2 14 3 c
#3 3 4 2 b
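A slightly shorter variant of the same join, using dplyr::coalesce() instead of ifelse(); a sketch assuming d1 and d2 as defined above:
library(dplyr)
left_join(d1, d2, by = "id") %>%
  transmute(id,
            x = coalesce(x.x, x.y),
            y = coalesce(y.x, y.y),
            z)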
We can use data.table with coalesce from dplyr. Create vectors of the column names that are common to both datasets ('nm1') and of those found only in the second ('nm2'). Convert the first dataset to a 'data.table' (setDT(d1)), join on the 'id' column, and assign (:=) the coalesced common columns of the first and second datasets (the latter carry the prefix i.) together with the new columns, to update the values in the first dataset:
library(data.table)
nm1 <- setdiff(intersect(names(d1), names(d2)), 'id')
nm2 <- setdiff(names(d2), names(d1))
setDT(d1)[d2, c(nm1, nm2) := c(Map(dplyr::coalesce, mget(nm1),
mget(paste0("i.", nm1))), mget(nm2)), on = .(id)]
d1
# id x y z
#1: 1 1 97 d
#2: 2 14 3 c
#3: 3 4 2 b

Concatenate rows and columns

I have a data set like this
x y z
a 5 4
b 1 2
And I want to concatenate the rows and columns:
ay 5
az 4
by 1
bz 2
Thanks
You can use melt and paste, but you will need to make your rownames a variable, i.e.
df$new <- rownames(df)
m_df <- reshape2::melt(df)
rownames(m_df) <- paste0(m_df$new, m_df$variable)
m_df <- m_df[-c(1:2)]
m_df
# value
#ax 5
#bx 1
#ay 4
#by 2
#az 3
#bz 1
After your edit, you don't need to convert rownames to a variable, so just:
m1_df <- reshape2::melt(df)
m1_df$new <- paste0(m1_df$x, m1_df$variable)
m1_df
# x variable value new
#1 a y 5 ay
#2 b y 1 by
#3 a z 4 az
#4 b z 2 bz
You can then tidy your data frame to get the required output.
With dplyr and tidyr:
library(dplyr)
library(tidyr)
df %>%
  gather(var, val, -x) %>%
  mutate(var = paste0(x, var)) %>%
  select(var, val) %>%
  arrange(var)
# var val
#1 ay 5
#2 az 4
#3 by 1
#4 bz 2
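The same reshape also works with the newer tidyr verb pivot_longer(), which has replaced gather(); a sketch assuming df as in the question:
library(dplyr)
library(tidyr)
df %>%
  pivot_longer(-x, names_to = "var", values_to = "val") %>%
  mutate(var = paste0(x, var)) %>%
  select(var, val) %>%
  arrange(var)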
library(reshape2)
library(dplyr)
library(tibble)
library(stringr)
# Create dataframe
x <- data.frame(x = c(5, 1),
                y = c(4, 2),
                z = c(3, 1),
                row.names = c('a', 'b'))
# Convert rowname to column and melt
x <- tibble::rownames_to_column(x, "rownames") %>%
  melt('rownames')
# assign concat columns as rownames
row.names(x) <- str_c(x$rownames, x$variable)
# Select relevant columns only
x <- select(x, value)
# Remove names from dataframe
names(x) <- NULL
> x
ax 5
bx 1
ay 4
by 2
az 3
bz 1
Here is another option in base R:
stack(setNames(as.list(unlist(df1[-1])),
               outer(df1$x, names(df1)[-1], paste0)))[2:1]
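A step-by-step sketch of what that one-liner does, assuming df1 is the question's data with the character column x = c("a", "b") and the numeric columns y and z:
vals <- unlist(df1[-1])                       # 5 1 4 2
nms  <- outer(df1$x, names(df1)[-1], paste0)  # "ay" "by" "az" "bz"
stack(setNames(as.list(vals), nms))[2:1]      # names column first, then values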
