Related
I have the following dataset:
A..B A..C B..C
value 2 5 9
and I would like to break it in a way such as I get the following output:
A B C
A 1 2 5
B 2 1 9
C 5 9 1
Any ideas on how I can do this in R?
Maybe you can try the base R code below
# Split each column name ("A..B") into its two labels.
dn <- strsplit(names(df), "..", fixed = TRUE)
# Size the matrix by the number of distinct labels, not by ncol(df):
# n labels produce n * (n - 1) / 2 pair columns, which equals n only for n == 3.
labs <- unique(unlist(dn))
mat <- diag(1, nrow = length(labs))
dimnames(mat) <- list(labs, labs)
# Two-column character index holding every pair in both orders, (i, j) and (j, i).
inds <- do.call(rbind, lapply(dn, function(x) rbind(x, rev(x))))
# Each value is written twice, once per orientation, so the result is symmetric.
mat[inds] <- rep(unlist(df), each = 2)
or
# Split each column name ("A..B") into its two labels.
dn <- strsplit(names(df), "..", fixed = TRUE)
# Size the matrix by the number of distinct labels, not by ncol(df):
# n labels produce n * (n - 1) / 2 pair columns, which equals n only for n == 3.
labs <- unique(unlist(dn))
mat <- diag(1, nrow = length(labs))
dimnames(mat) <- list(labs, labs)
for (k in seq_along(dn)) {
  pair <- dn[[k]]
  # Fill both orientations directly, so symmetry does not depend on every
  # pair happening to fall in the upper triangle.
  mat[pair[1], pair[2]] <- df[, k]
  mat[pair[2], pair[1]] <- df[, k]
}
such that
> mat
A B C
A 1 2 5
B 2 1 9
C 5 9 1
Data
> dput(df)
structure(list(A..B = 2L, A..C = 5L, B..C = 9L), class = "data.frame", row.names = "value")
An option with tidyverse
library(dplyr)
library(tidyr)
library(tibble)
# Long format: one row per pair, with the two labels in separate columns.
pairs <- df %>%
  pivot_longer(cols = everything()) %>%
  separate(name, into = c("name1", "name2"))
# Take the label set from the data instead of hard-coding LETTERS[1:3].
labs <- union(pairs$name1, pairs$name2)
pairs %>%
  complete(name1 = labs, name2 = labs, fill = list(value = 0)) %>%
  pivot_wider(names_from = name2, values_from = value) %>%
  column_to_rownames("name1") %>%
  as.matrix() %>%
  # Missing cells were filled with 0, so adding the transpose mirrors each value.
  {. + t(.)} %>%
  `diag<-`(., 1)
# A B C
#A 1 2 5
#B 2 1 9
#C 5 9 1
data
df <- structure(list(A..B = 2L, A..C = 5L, B..C = 9L),
class = "data.frame", row.names = "value")
Here's another option that uses matrix indexing to fill in the values:
library(splitstackshape)
# stack your dataset and split the names into two columns
# (stack() yields the columns `values` and `ind`; "ind" holds the original
# column names and is the column split on "..")
x <- cSplit(stack(df), "ind", "..")
# ij is going to be your index of row and column combinations
# NOTE(review): assumes columns 2 and 3 of the cSplit result are the two
# split name parts - confirm against the cSplit documentation
ij <- as.matrix(x[, 2:3])
u <- unique(c(ij))
# initialize a matrix of 1s
m <- matrix(1, nrow = length(u), ncol = length(u),
dimnames = list(u, u))
# replace the relevant indices with values
# rbind(ij, ij[, 2:1]) lists every pair in both orders; x$values is recycled
# across the two halves so the matrix comes out symmetric
m[rbind(ij, ij[, 2:1])] <- x$values
m
# A B C
# A 1 2 5
# B 2 1 9
# C 5 9 1
In base you can use strsplit to get the names, use unique to get all levels and create a matrix initialized with 1L and the size of the levels. Then you can fill up the matrix by using the names to find the position of the values.
# One row per column name, holding the two labels found around "..".
pairs <- do.call(rbind, strsplit(names(x), "..", fixed = TRUE))
labels <- unique(as.vector(pairs))
n <- length(labels)
# Start from an all-ones integer matrix (the diagonal stays 1).
m <- matrix(1L, nrow = n, ncol = n, dimnames = list(labels, labels))
# Index every pair in both orders; the values are recycled over both halves.
both_ways <- rbind(pairs, pairs[, 2:1])
m[both_ways] <- unlist(x)
#m[both_ways] <- x #Alternative in case of a vector
m
# A B C
#A 1 2 5
#B 2 1 9
#C 5 9 1
Data:
x <- data.frame(A..B = 2L, A..C = 5L, B..C = 9L, row.names = "value")
#x <- c(A..B = 2L, A..C = 5L, B..C = 9L) #Alternative as a vector
I have two data frames. dfOne is made like this:
X Y Z T J
3 4 5 6 1
1 2 3 4 1
5 1 2 5 1
and dfTwo is made like this
C.1 C.2
X Z
Y T
I want to obtain a new dataframe containing the rows where the X, Y, Z, T values are simultaneously greater than a specific threshold.
Example. I need simultaneously (in the same row):
X, Y > 2
Z, T > 4
I need to use the second data frame to reach my objective, I expect something like:
dfTwo$C.1>2
so the result would be a new dataframe with this structure:
X Y Z T J
3 4 5 6 1
How could I do it?
Here is a base R method with Map and Reduce.
# build lookup table of thresholds relative to variable name
# Threshold for each variable, named after the variables listed in dat2.
vals <- setNames(c(2, 2, 4, 4), unlist(dat2))
# One logical vector per thresholded column: TRUE where the column exceeds it.
passes <- Map(function(column, cutoff) column > cutoff, dat[names(vals)], vals)
# Keep only the rows for which every comparison holds.
keep <- Reduce(`&`, passes)
dat[keep, ]
X Y Z T J
1 3 4 5 6 1
Here, Map returns a list of length 4 with logical variables corresponding to each comparison. This list is passed to Reduce which returns a single logical vector with length corresponding to the number of rows in the data.frame, dat. This logical vector is used to subset dat.
data
dat <-
structure(list(X = c(3L, 1L, 5L), Y = c(4L, 2L, 1L), Z = c(5L,
3L, 2L), T = c(6L, 4L, 5L), J = c(1L, 1L, 1L)), .Names = c("X",
"Y", "Z", "T", "J"), class = "data.frame", row.names = c(NA,
-3L))
dat2 <-
structure(list(C.1 = structure(1:2, .Label = c("X", "Y"), class = "factor"),
C.2 = structure(c(2L, 1L), .Label = c("T", "Z"), class = "factor")), .Names = c("C.1",
"C.2"), class = "data.frame", row.names = c(NA, -2L))
We can use the purrr package
Here is the input data.
# Data frame from lmo's solution
dat <-
structure(list(X = c(3L, 1L, 5L), Y = c(4L, 2L, 1L), Z = c(5L,
3L, 2L), T = c(6L, 4L, 5L), J = c(1L, 1L, 1L)), .Names = c("X",
"Y", "Z", "T", "J"), class = "data.frame", row.names = c(NA,
-3L))
# A numeric vector to show the threshold values
# Notice that columns without any requirements need NA
vals <- c(X = 2, Y = 2, Z = 4, T = 4, J = NA)
Here is the implementation
library(purrr)
map2_dfc(dat, vals, ~ifelse(.x > .y | is.na(.y), .x, NA)) %>% na.omit()
# A tibble: 1 x 5
X Y Z T J
<int> <int> <int> <int> <int>
1 3 4 5 6 1
map2_dfc loops through each column in dat and each value in vals one by one with a defined function. ~ifelse(.x > .y | is.na(.y), .x, NA) means that if the number in each column is larger than the corresponding value in vals, or the value in vals is NA, the output is the original value from the column. Otherwise, the value is replaced with NA. The output of map2_dfc(dat, vals, ~ifelse(.x > .y | is.na(.y), .x, NA)) is a data frame with NA values in some rows, indicating that the condition is not met. Finally, na.omit removes those rows.
Update
Here I demonstrate how to convert the dfTwo data frame to the vals vector in my example.
First, let's create the dfTwo data frame.
dfTwo <- read.table(text = "C.1 C.2
X Z
Y T",
header = TRUE, stringsAsFactors = FALSE)
dfTwo
C.1 C.2
1 X Z
2 Y T
To complete the task, I load the dplyr and tidyr package.
library(dplyr)
library(tidyr)
Now I begin the transformation of dfTwo. The first step is to use stack function to convert the format.
dfTwo2 <- dfTwo %>%
stack() %>%
setNames(c("Col", "Group")) %>%
mutate(Group = as.character(Group))
dfTwo2
Col Group
1 X C.1
2 Y C.1
3 Z C.2
4 T C.2
The second step is to add the threshold information. One way to do this is to create a look-up table showing the association between Group and Value
threshold_df <- data.frame(Group = c("C.1", "C.2"),
Value = c(2, 4),
stringsAsFactors = FALSE)
threshold_df
Group Value
1 C.1 2
2 C.2 4
And then we can use the left_join function to combine the data frame.
# Attach each column's threshold by joining on the shared Group key
# (the lookup table created above is threshold_df).
dfTwo3 <- dfTwo2 %>% left_join(threshold_df, by = "Group")
dfTwo3
Col Group Value
1 X C.1 2
2 Y C.1 2
3 Z C.2 4
4 T C.2 4
Now it is the third step. Notice that there is a column called J which does not need any threshold. So we need to add this information to dfTwo3. We can use the complete function from tidyr. The following code completes the data frame by adding Col in dat but not in dfTwo3 and NA to the Value.
dfTwo4 <- dfTwo3 %>% complete(Col = colnames(dat))
dfTwo4
# A tibble: 5 x 3
Col Group Value
<chr> <chr> <dbl>
1 J <NA> NA
2 T C.2 4
3 X C.1 2
4 Y C.1 2
5 Z C.2 4
The fourth step is to arrange dfTwo4 in the right order. We can achieve this by turning Col into a factor and assigning the levels based on the order of the column names in dat.
dfTwo5 <- dfTwo4 %>%
mutate(Col = factor(Col, levels = colnames(dat))) %>%
arrange(Col) %>%
mutate(Col = as.character(Col))
dfTwo5
# A tibble: 5 x 3
Col Group Value
<chr> <chr> <dbl>
1 X C.1 2
2 Y C.1 2
3 Z C.2 4
4 T C.2 4
5 J <NA> NA
We are almost there. Now we can create vals from dfTwo5.
vals <- dfTwo5$Value
names(vals) <- dfTwo5$Col
vals
X Y Z T J
2 2 4 4 NA
Now we are ready to use the purrr package to filter the data.
The above is a breakdown of the steps. We can combine all these steps into the following code for simplicity.
library(dplyr)
library(tidyr)
# Threshold assigned to each group of columns listed in dfTwo.
threshold_df <- data.frame(
  Group = c("C.1", "C.2"),
  Value = c(2, 4),
  stringsAsFactors = FALSE
)
# Column order of dat, used to line the thresholds up with its columns.
dat_cols <- colnames(dat)
dfTwo2 <- dfTwo %>%
  stack() %>%
  setNames(c("Col", "Group")) %>%
  mutate(Group = as.character(Group)) %>%
  left_join(threshold_df, by = "Group") %>%
  # add the columns of dat that have no threshold (their Value becomes NA)
  complete(Col = dat_cols) %>%
  # sort the rows into the column order of dat
  mutate(Col = factor(Col, levels = dat_cols)) %>%
  arrange(Col) %>%
  mutate(Col = as.character(Col))
# Named threshold vector: one entry per column of dat.
vals <- setNames(dfTwo2$Value, dfTwo2$Col)
# Row numbers satisfying each individual condition.
hits_x <- which(dfOne["X"] > 2)
hits_y <- which(dfOne["Y"] > 2)
hits_z <- which(dfOne["Z"] > 4)
hits_t <- which(dfOne["T"] > 4)
# Rows that satisfy all four conditions at once.
rows <- Reduce(intersect, list(hits_x, hits_y, hits_z, hits_t))
dfOne[rows, ]
# X Y Z T J
#1 3 4 5 6 1
Or iteratively (so fewer inequalities are tested):
vals <- c(X = 2, Y = 2, Z = 4, T = 4) # thresholds, from @lmo's answer
# Row numbers where the named column exceeds its threshold.
rows_over <- function(col) which(dfOne[col] > vals[col])
# Intersect the per-column row sets to keep rows meeting every threshold.
dfOne[Reduce(intersect, lapply(names(vals), rows_over)), ]
# X Y Z T J
#1 3 4 5 6 1
I'm writing this assuming that the second DF is meant to categorize the fields in the first DF. It's way simpler if you don't need to use the second one to define the conditions:
# Keep the rows where all four columns exceed their thresholds.
dfNew <- dfOne[with(dfOne, X > 2 & Y > 2 & Z > 4 & T > 4), ]
Or, using dplyr:
library(dplyr)
# Conditions separated by commas in filter() are combined with a logical AND.
dfNew <- dfOne %>%
  filter(X > 2, Y > 2, Z > 4, T > 4)
In case that's all you need, I'll save this comment while I poke at the more complicated version of the question.
I would like to setdiff between consecutive groups without for looping, if possible with a datatable way or a function of apply family.
Dataframe df :
id group
1 L1 1
2 L2 1
3 L1 2
4 L3 2
5 L4 2
6 L3 3
7 L5 3
8 L6 3
9 L1 4
10 L4 4
11 L2 5
I want to know how many new ids there are between consecutive groups. So, for example, if we compare groups 1 and 2, there are two new ids, L3 and L4, so it returns 2 (not with setdiff directly but with length()); if we compare groups 2 and 3, L5 and L6 are the new ids, so it returns 2, and so on.
Expected results :
new_id
2
2
2
1
Data :
structure(list(id = structure(c(1L, 2L, 1L, 3L, 4L, 3L, 5L, 6L,
1L, 4L, 2L), .Label = c("L1", "L2", "L3", "L4", "L5", "L6"), class = "factor"),
group = c(1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 5)), class = "data.frame", row.names = c(NA,
-11L), .Names = c("id", "group"))
Here is an option with mapply:
# ids of each group, in group order.
lst <- split(df$id, df$group)
# Pair every group with its successor: all but the last vs. all but the first.
prev <- head(lst, -1)
nxt <- tail(lst, -1)
# Number of ids in the later group that are absent from the earlier one.
mapply(function(before, after) length(setdiff(after, before)), prev, nxt)
#1 2 3 4
#2 2 2 1
Here is a data.table way with merge. Suppose the original data.frame is named dt:
library(data.table)
setDT(dt)
# Shift every group id up by one so that, after joining on `group`, the rows
# of each group are paired with the ids of the group just before it.
dt2 <- copy(dt)[, group := group + 1]
# id.x comes from dt (current group), id.y from dt2 (previous group).
# Spell out TRUE: `T` is an ordinary variable that can be reassigned.
merge(
  dt, dt2, by = "group", allow.cartesian = TRUE
)[, .(n = length(setdiff(id.x, id.y))), by = group]
# group n
# 1: 2 2
# 2: 3 2
# 3: 4 2
# 4: 5 1
You could use Reduce to run a comparison function on pairwise elements in a list. For example
# Walk the groups in order. `a` is the previous step's result (or the first
# group's data frame) and `b` is the current group's data frame; both expose
# an `id` element. Keep every intermediate result, then drop the first entry,
# which is the untouched first group.
xx <- Reduce(function(a, b) {
  x <- setdiff(b$id, a$id)
  list(id = b$id, new = x, newcount = length(x))
}, split(df, df$group),
# full argument name instead of relying on partial matching of `acc`
accumulate = TRUE)[-1]
Then you can get the counts of new elements out with
# vapply() guarantees an integer vector, including for an empty `xx`,
# where sapply() would return an empty list instead.
vapply(xx, "[[", integer(1), "newcount")
and you can get the new values with
sapply(xx, '[[', "new")
L <- split(d, d$group) # Split data ('d') by group and create a list
# For each group after the first, count the ids (first column) that are absent
# from the previous group, and label the count "<group>-<previous group>".
# seq_along(L)[-1] is empty when there are fewer than two groups, whereas
# 2:length(L) would count backwards and index past the end of the list.
sapply(seq_along(L)[-1], function(i)
  setNames(length(setdiff(L[[i]][, 1], L[[i - 1]][, 1])),
    nm = paste(names(L)[i], names(L)[i - 1], sep = "-")))
#2-1 3-2 4-3 5-4
# 2 2 2 1
I have the following input
#mydata
ID variable1 variable2
1 a,b,c,d c,a
2 g,f,h h
3 p,l,m,n,c c,l
I wish to remove the strings in variable2 from variable1, and I'd like to get the following output:
#Output
ID Output
1 b,d
2 g,f
3 p,m,n
#dput
structure(list(ID = 1:3, variable1 = structure(1:3, .Label = c("a,b,c,d",
"g,f,h", "p,l,m,n,c"), class = "factor"), variable2 = structure(c(1L,
3L, 2L), .Label = c("c,a", "c,l", "h"), class = "factor")), .Names = c("ID",
"variable1", "variable2"), class = "data.frame", row.names = c(NA,
-3L))
You can try,
# Split both columns on "," and, row by row, keep the elements of variable1
# that do not appear in variable2. (The original line had a stray ")" after
# the first strsplit() call, which made it a syntax error.)
Map(setdiff,
  strsplit(as.character(df$variable1), ","),
  strsplit(as.character(df$variable2), ","))
We can use Map after splitting each of the columns by , get the setdiff, paste them together, set the names of the list output with 'ID' column, stack it to 'data.frame' and set the names to 'ID' and 'Output' for the columns.
# Row by row: elements of variable1 not present in variable2, as one string.
kept <- Map(function(x, y) toString(setdiff(x, y)),
  strsplit(as.character(df1$variable1), ","),
  strsplit(as.character(df1$variable2), ","))
# Label each result with its ID so stack() carries the ID along.
names(kept) <- df1$ID
# stack() returns (values, ind); swap the columns and rename them.
res <- stack(kept)[2:1]
names(res) <- c("ID", "Output")
res
# ID Output
#1 1 b, d
#2 2 g, f
#3 3 p, m, n
Or a compact option would be
library(splitstackshape)
# Split columns 2 and 3 on "," in "long" layout, then, per ID, collapse the
# values of variable1 that do not occur in variable2 into one string.
# NOTE(review): presumably the long layout gives one row per split element and
# pads the shorter column with NA - confirm against the cSplit documentation.
cSplit(df1, 2:3, ",", "long")[, .(Output = toString(setdiff(variable1, variable2))) , ID]
# ID Output
#1: 1 b, d
#2: 2 g, f
#3: 3 p, m, n
Using grepl instead of setdiff
library(stringr)
a1 <- str_split(d$variable1, ",")
a2 <- str_split(d$variable2, ",")
# Anchor the alternation so that only whole elements are dropped; without
# ^(...)$ grepl() also removes any element that merely contains one of the
# strings (e.g. "a" would remove "ab").
# NOTE: the elements are still interpreted as regular expressions, so values
# containing metacharacters such as "." or "+" need escaping first.
drop_matches <- function(x, y) {
  pattern <- paste0("^(", paste(y, collapse = "|"), ")$")
  paste(x[!grepl(pattern, x)], collapse = ",")
}
do.call("rbind", Map(drop_matches, a1, a2))
[,1]
[1,] "b,d"
[2,] "g,f"
[3,] "p,m,n"
Using Dplyr
# Row by row, split both columns on "," and keep the elements of variable1
# that are not in variable2, re-joined with ",".
# The pipe must end the line it continues from: a line that starts with %>%
# is a parse error in R.
mydata %>%
  rowwise() %>%
  mutate(output = paste0(
    setdiff(
      strsplit(as.character(variable1), split = ",")[[1]],
      strsplit(as.character(variable2), ",")[[1]]
    ),
    collapse = ","
  )) %>%
  select(ID, output)
output:
ID output
(int) (chr)
1 1 b,d
2 2 g,f
3 3 p,m,n
I have 19 nested lists generated from a lapply and split operation.
These lists are in the form:
#list1
Var col1 col2 col3
A 2 3 4
B 3 4 5
#list2
Var col1 col2 col3
A 5 6 7
B 5 4 4
......
#list19
Var col1 col2 col3
A 3 6 7
B 7 4 4
I have been able to merge the lists with
# Full outer join (all = TRUE) of two data frames on the shared "Var" column.
merge.all <- function(x, y) merge(x, y, all=TRUE, by="Var")
# Fold the join over every element of DataList, from left to right.
out <- Reduce(merge.all, DataList)
I am however getting an error due to the similarity in the names of the other columns.
How can I concatenate the name of the list to the variable names so that I get something like this:
Var list1.col1 list1.col2 list1.col3 .......... list19.col3
A 2 3 4 7
B 3 4 5 .......... 4
I'm really sure somebody will come up with a much, much better solution. However, if you're after a quick and dirty solution, this seems to work.
My plan was to simply change the column names prior to merging.
#Sample Data
df1 <- data.frame(Var = c("A","B"), col1 = c(2,3), col2 = c(3,4), col3 = c(4,5))
df2 <- data.frame(Var = c("A","B"), col1 = c(5,5), col2 = c(6,4), col3 = c(7,5))
df19 <- data.frame(Var = c("A","B"), col1 = c(3,7), col2 = c(6,4), col3 = c(7,4))
mylist <- list(df1, df2, df19)
names(mylist) <- c("df1", "df2", "df19") #just manually naming, presumably your list has names
## Change column names by pasting name of dataframe in list with standard column names. - using ugly mix of `lapply` and a `for` loop:
# Column names shared by all data frames (taken from the first one).
mycolnames <- colnames(df1)
# One vector of prefixed names per list element, e.g. "df1col1".
mycolnames1 <- lapply(names(mylist), function(x) paste0(x, mycolnames))
# seq_along() is safe for an empty list, where 1:length(mylist) yields c(1, 0).
for (i in seq_along(mylist)) {
  colnames(mylist[[i]]) <- mycolnames1[[i]]
  colnames(mylist[[i]])[1] <- "Var" #put Var back in so you can merge
}
## Merge
# Full outer join of two data frames on the shared "Var" key column:
# rows present in only one input are kept and padded with NA.
merge.all <- function(x, y) {
  merge(x = x, y = y, by = "Var", all = TRUE)
}
out <- Reduce(merge.all, mylist)
out
# Var df1col1 df1col2 df1col3 df2col1 df2col2 df2col3 df19col1 df19col2 df19col3
#1 A 2 3 4 5 6 7 3 6 7
#2 B 3 4 5 5 4 5 7 4 4
There you go - it works but is very ugly.
To set the data frame names unique, you could use a function to set all list names that are not the merging variable to unique names.
# Give every column of every element of `x` a unique, list-qualified name
# ("<element name>.<column name>"), while leaving the merge key `byvar`
# unqualified so the elements can still be merged on it.
#   x      list of data frames (assumed to be a named list - the element
#          names become the prefixes)
#   byvar  name(s) of the key column(s) to keep as-is (default "Var")
# Returns the elements of `x` with the new column names applied.
resetNames <- function(x, byvar = "Var") {
# per-element column names, tagged so a flat vector can be folded back later
asrl <- as.relistable(lapply(x, names))
# flattening one level yields names of the form "<element>.<column>"
allnm <- names(unlist(x, recursive = FALSE))
# put the bare key name back wherever the original column was a key column
rpl <- replace(allnm, unlist(asrl) %in% byvar, byvar)
# restore the per-element structure and apply the new names to each element
Map(setNames, x, relist(rpl, asrl))
}
Reduce(merge.all, resetNames(dlist))
# Var list1.col1 list1.col2 list1.col3 list2.col1 list2.col2 list2.col4 list3.col1
#1 A 2 3 4 5 6 7 3
#2 B 3 4 5 5 4 4 7
# list3.col2 list3.col3 list4.col1 list4.col2 list4.col3
#1 6 7 3 6 7
#2 4 4 4 5 6
When I run this on your list with an added data frame, there are no warnings. And there's always data.table: its merge method does not return a warning for duplicated column names.
library(data.table)
Reduce(merge.all, lapply(dlist, as.data.table))
Another option is to check the names as the data enters the function, change them there, and then you can avoid the warning. This isn't perfect but it works ok here.
# Full outer join of `x` and `y` on "Var" that first renames the columns of
# `y` clashing with a column already in `x` by appending "DUPE".
#   x, y  data frames whose first column is the key "Var"
# Returns the merged data frame.
merge.all <- function(x, y) {
  # Names already present in x, ignoring any ".x"/".y" suffix added by an
  # earlier merge(); the first column of each input is taken to be the key.
  taken <- gsub("[.](x|y)$", "", names(x)[-1])
  # Logical mask over y's non-key columns. match() returns positions within
  # x's names, so using those positions to index y's names renames the wrong
  # columns whenever the two name vectors are not aligned.
  dupe <- names(y)[-1] %in% taken
  names(y)[-1][dupe] <- paste0(names(y)[-1][dupe], "DUPE")
  merge(x, y, all = TRUE, by = "Var")
}
rm <- Reduce(merge.all, dlist)
names(rm)
# [1] "Var" "col1" "col2" "col3" "col1DUPE.x"
# [6] "col2DUPE.x" "col4" "col1DUPE.y" "col2DUPE.y" "col3DUPE.x"
# [11] "col1DUPE" "col2DUPE" "col3DUPE.y"
where dlist is
structure(list(list1 = structure(list(Var = structure(1:2, .Label = c("A",
"B"), class = "factor"), col1 = 2:3, col2 = 3:4, col3 = 4:5), .Names = c("Var",
"col1", "col2", "col3"), class = "data.frame", row.names = c(NA,
-2L)), list2 = structure(list(Var = structure(1:2, .Label = c("A",
"B"), class = "factor"), col1 = c(5L, 5L), col2 = c(6L, 4L),
col4 = c(7L, 4L)), .Names = c("Var", "col1", "col2", "col4"
), class = "data.frame", row.names = c(NA, -2L)), list3 = structure(list(
Var = structure(1:2, .Label = c("A", "B"), class = "factor"),
col1 = c(3L, 7L), col2 = c(6L, 4L), col3 = c(7L, 4L)), .Names = c("Var",
"col1", "col2", "col3"), class = "data.frame", row.names = c(NA,
-2L)), list4 = structure(list(Var = structure(1:2, .Label = c("A",
"B"), class = "factor"), col1 = 3:4, col2 = c(6L, 5L), col3 = c(7L,
6L)), .Names = c("Var", "col1", "col2", "col3"), row.names = c(NA,
-2L), class = "data.frame")), .Names = c("list1", "list2", "list3",
"list4"))