Need help in flagging number of Dependents and Precedents in R. My data frame contains some formulas (strings) and I want to add "col3" which should contain: 0 for A1, 1 for A2 (Because A2 is dependent on A1 - One dependency) and 2 for A3 (Because A3 is dependent on A2/A1).
col1 <- c('A1','A2','A3', 'A6','A4','A7')
col2 <- c('X1+Y1','A1+Y2', 'A4+Y3+A2', 'Y5+A1','A2+A1+A3','A2+A1')
df <- data.frame(col1, col2, stringsAsFactors=F)
My Output should look like:
col1 col2 col3
1 A1 Y1 0
2 A2 A1+Y2 1
3 A3 A4+Y3+A2 5
4 A6 Y5+A1 1
5 A4 A2+A1+Y3 3
6 A7 A2+A1 3
I have a data frame with 100+ rows of this format. Appreciate if you could help with this.
Below code produces the correct output.
col0 <- c('A1','A2','A3', 'A6','A4','A7')
col2 <- c('X1+Y1','A1+Y2', 'A1+Y3+A2', 'Y5+A2','A2+A1+A3','A2+A3')
df <- data.frame(col0, col2, stringsAsFactors=F)
library(tidyr)
library(dplyr)
df1 <- df %>%
separate(col2, into = as.character(c(1:4)),sep = "\\+") %>%
replace(is.na(.),"")
df1$OOE <- 0
for (i in 1:nrow(df1)) {
for (j in 2:ncol(df1)) {
for (k in 1:nrow(df1)) {
if (df1[i,j] == df1$col0[k]) df1$OOE[i]=df1$OOE[k]+df1$OOE[i]+1
}
}
}
col0 1 2 3 4 OOE
1 A1 X1 Y1 0
2 A2 A1 Y2 1
3 A3 A1 Y3 A2 3
4 A6 Y5 A2 2
5 A4 A2 A1 A3 7
6 A7 A2 A3 6
If AX can have a dependency on AY where Y>X, we need a tree like structure to find the dependencies. I knew about the igraph package but it seems to complex for the task. We just need some reference semantics and after some research, data.tree package seems appropriate. Here is the code:
col1 <- c('A1','A2','A3', 'A6','A4','A7')
col2 <- c('X1+Y1','A1+Y2', 'A1+Y3+A2', 'Y5+A2','A2+A1+A3','A2+A3')
df <- data.frame(col1, col2, stringsAsFactors=F)
require(data.tree)
# Create the graph/forest based on the data
getForest <- function(data) {
res <- new.env()
for( i in 1:nrow(data)){
nname <- data$col1[i]
if(!exists(nname,where=res))
assign(nname,Node$new(nname), pos=res)
par <- get(nname, envir=res)
print(par)
#Add the childs
deps <- unlist(regmatches(data$col2[i],gregexpr("A\\d+",data$col2[i])))
for( ch in deps) {
print("Ammm")
if(!exists(ch, where=res))
assign(ch,Node$new(ch), pos=res)
child <- get(ch, envir=res)
par$AddChildNode(child)
}
}
#Return the nodes
res
}
f <- getForest(df)
# Function to get the dependency level
getLevel<- function(node) {
if (node$count == 0)
return (0)
else {
return (length(node$children)+sum(sapply(node$children,getlevel)))
}
}
#Add dependency level to data frame
df$col3 <- sapply(df$col1, function(x) {getLevel(get(x,f))})
df
# col1 col2 col3
#1 A1 X1+Y1 0
#2 A2 A1+Y2 1
#3 A3 A1+Y3+A2 3
#4 A6 Y5+A2 2
#5 A4 A2+A1+A3 7
#6 A7 A2+A3 6
Related
I'm trying to cbind or unnest or as.data.table a partially nested list.
id <- c(1,2)
A <- c("A1","A2","A3")
B <- c("B1")
AB <- list(A=A,B=B)
ABAB <- list(AB,AB)
nested_list <- list(id=id,ABAB=ABAB)
The length of id is the same as ABAB (2 in this case). I don't know how to unlist a part of this list (ABAB) and cbind another part (id). Here's my desired result as a data.table:
data.table(id=c(1,1,1,2,2,2),A=c("A1","A2","A3","A1","A2","A3"),B=rep("B1",6))
id A B
1: 1 A1 B1
2: 1 A2 B1
3: 1 A3 B1
4: 2 A1 B1
5: 2 A2 B1
6: 2 A3 B1
I haven't tested for more general cases, but this works for the OP example:
library(data.table)
as.data.table(nested_list)[, lapply(ABAB, as.data.table)[[1]], id]
# id A B
#1: 1 A1 B1
#2: 1 A2 B1
#3: 1 A3 B1
#4: 2 A1 B1
#5: 2 A2 B1
#6: 2 A3 B1
Or another option (which is probably faster, but is more verbose):
rbindlist(lapply(nested_list$ABAB, as.data.table),
idcol = 'id')[, id := nested_list$id[id]]
This is some super ugly base R, but produces the desired output.
Reduce(rbind, Map(function(x, y) setNames(data.frame(x, y), c("id", "A", "B")),
as.list(nested_list[[1]]),
lapply(unlist(nested_list[-1], recursive=FALSE),
function(x) Reduce(cbind, x))))
id A B
1 1 A1 B1
2 1 A2 B1
3 1 A3 B1
4 2 A1 B1
5 2 A2 B1
6 2 A3 B1
lapply takes the a list of two elements (each containing the A and B variables) extracted with unlist and recursive=FALSE. It returns a list of character matrices with the B variable filled in by recycling. A list of the individual id variables from as.list(nested_list[[1]]) and the lit of matrices are fed to Map which converts corresponding pairs to a data.frame and gives the columns the desired names and returns a list of data.frames. Finally, this list of data.frames is fed to Reduce, which rbinds the results to a single data.frame.
The final Reduce(rbind, could be replaced by data.tables rbindlist if desired.
Here's another hideous solution
max_length = max(unlist(lapply(nested_list, function(x) lapply(x, lengths))))
data.frame(id = do.call(c, lapply(nested_list$id, rep, max_length)),
do.call(rbind, lapply(nested_list$ABAB, function(x)
do.call(cbind, lapply(x, function(y) {
if(length(y) < max_length) {
rep(y, max_length)
} else {
y
}
})))))
# id A B
#1 1 A1 B1
#2 1 A2 B1
#3 1 A3 B1
#4 2 A1 B1
#5 2 A2 B1
#6 2 A3 B1
And one more, also inelegant- but I`d gone too far by the time I saw the other answers.
restructure <- function(nested_l) {
ids <- as.numeric(max(unlist(lapply(unlist(nested_l, recursive = FALSE), function(x){
lapply(x, length)
}))))
temp = data.frame(rep(nested_l$id, each = ids),
sapply(1:length(nested_l$id), function(x){
out <-unlist(lapply(nested_l[[2]], function(y){
return(y[x])
}))
}))
names(temp) <- c("id", unique(substring(unlist(nested_l[2]), first = 1, last = 1)))
return(temp)
}
> restructure(nested_list)
id A B
1 1 A1 B1
2 1 A2 B1
3 1 A3 B1
4 2 A1 B1
5 2 A2 B1
6 2 A3 B1
Joining the party:
library(tidyverse)
temp <- map(nested_list,~map(.x,~expand.grid(.x)))
df <- map_df(1:2,~cbind(temp$id[[.x]],temp$ABAB[[.x]]))
Var1 A B
1 1 A1 B1
2 1 A2 B1
3 1 A3 B1
4 2 A1 B1
5 2 A2 B1
6 2 A3 B1
I'm new to this but I'm pretty sure this question hasn't been answered, or I'm just not good at searching....
I would like to subtract the values in multiple rows from a particular row based on matching columns and values. My actual data will be a large matrix with >5000 columns, eaching needing to be subtracted by a blank value that matches the a value in a factor column.
Here is an example data table:
c1 c2 c3 c4 c5
r1 A 1 2 3 aa
r2 B 2 3 4 bb
r3 C 3 4 5 aa
r4 D 4 1 6 bb
r5 Blank 2 3 4 aa
r6 Blank 3 4 5 bb
I would like to subtract the c2,c3,and c4 values of c1 ="Blank" row from A,B,and C using the c5 factor to define which Blank values are used (aa or bb). I would like the "Blank" values to be subtracted from all rows sharing c5 info.
(i know this is confusing to describe)
So the results would look like this:
c1 c2 c3 c4 c5
r1 A -1 -1 -1 aa
r2 B -1 -1 -1 bb
r3 C 1 1 1 aa
r4 D 1 -3 1 bb
I've seen the ddply function work for doing something like this with a single column, but I wasn't able to expand that to perform this task for multiple columns. I'm a noob though...
Thank you for your help!
This is not tested for all possible cases, but should give you an idea:
df <- read.table(text =
"c1 c2 c3 c4 c5
r1 A 1 2 3 aa
r2 B 2 3 4 bb
r3 C 3 4 5 aa
r4 D 4 1 6 bb
r5 Blank 2 3 4 aa
r6 Blank 3 4 5 bb", header = T)
library(data.table)
# separate dataset into two
dt <- data.table(df, key = "c5")
dt.blank <- dt[c1 == "Blank"]
dt <- dt[c1 != "Blank"]
# merge into resulting dataset
dt.res <- dt[dt.blank]
# update each column
columns.count <- ncol(dt)
for(i in 2:(columns.count-1)) {
dt.res[[i]] <- dt.res[[i]] - dt.res[[i + columns.count]]
}
# > dt.res
# c1 c2 c3 c4 c5 i.c1 i.c2 i.c3 i.c4
# 1: A -1 -1 -1 aa Blank 2 3 4
# 2: C 1 1 1 aa Blank 2 3 4
# 3: B -1 -1 -1 bb Blank 3 4 5
# 4: D 1 -3 1 bb Blank 3 4 5
First split your data, since there's no reason you have them in a single data structure. Then apply the function:
# recreate your data
df <- data.frame(rbind(c(1:3, "aa"), c(2:4, "bb"), c(3:5, "aa"), c(4,1,6, "bb"), c(2:4, "aa"), c(3:5, "bb")))
df[,1:3] <- apply(df[,1:3], 2, as.integer)
# split it
blank1 <- df[5,]
blank2 <- df[6,]
df <- df[1:4,]
for (i in 1:nrow(df)) {
if (df[i,4] == "aa") {df[i,1:3] <- df[i,1:3] - blank1[1:3]}
else {df[i,1:3] <- df[i,1:3] - blank2[1:3]}
}
There are a few different was to run the loop, including vectorizing. But this suffices. I'd also argue that there's no reason to keep the labels "aa" v "bb" in the initial data structure either, which would make this simpler; but it's your choice.
I would like to compare two columns simultaneously. My data looks like this:
a <- data.frame("a1" = c(1,1,1,3,4), "a2" = c(2,1,2,1,2))
b <- data.frame("b1" = c(1,1,3,1,3), "b2" = c(2,2,1,2,1))
cbind(a, b)
# a1 a2 b1 b2
# 1 1 2 1 2
# 2 1 1 1 2
# 3 1 2 3 1
# 4 3 1 1 2
# 5 4 2 3 1
I would like to identify all rows of a where a1 is not in b1 or where a1 is in b1 but a2 for the special a1 is not in b2 for the special b2. So the second question is: When a1 is in b1 is then a2 for this row for a1 also in b2 for this row for b1.
Example for line 2: I am checking, if a1 = 1 is anywhere in b1 = c(1,1,3,1,3). It is, so I want to check if a2 = 1 in line 2 (where a1 = 1) is anywhere in b2 where b1 = a1 = 1, so here b2 = c(2, 2, 2). For line 2 a2 = 1 is not in b2 = c(2, 2, 2), so the result should show me this line.
The first question is easy to answer with the following code:
a[which(!(a$a1 %in% b$b1)), ]
# a1 a2
# 5 4 2
But I can't fix the second problem. Maybe I am working in a wrong way with the logical operators. My result should look like this:
a1 a2
2 1 1
4 4 2
Following the explanation in your edit, you want the rows where either the specific a1 from a is not in b1 from b or where the specific a1 from a is equal to b1 of the same row in b and a2 from a is not among the values of b2 from b of the rows for which b1 equals the value of the specific a1.
In R, you can write this like that:
cond <- sapply(seq(nrow(a)), # check each row, one by one
function (i){
!(a$a1[i] %in% b$b1) | # a1 of the specific row is not in b1 or
!(a$a2[i] %in% b$b2[b$b1==a$a1[i]]) # a2 of the specific row is not in the values of b2 for which b1 equals a1 of the sepcific row
})
a[cond, ]
# a1 a2
#2 1 1
#5 4 2
Obviously not a nice solution, but it works with my data (unequal dimension of rows of the two datasets, not the same position of the values in the variables) - here with new example data, because I chose the first really bad.
a <- data.frame("a1" = c(1,1,1,3,4), "a2" = c(2,1,2,1,2))
b <- data.frame("b1" = c(1,3,1,1), "b2" = c(2,1,2,2))
test <- function (data1, data2) {
for (i in unique(data1[data1$a1 %in% data2$b1, "a1"])) {
temp_data1 <- data1[data1$a1 == i, c("a1", "a2")]
temp_data2 <- data2[data2$b1 == i, c("b1", "b2")]
for (j in unique(temp_data1$a2)) {
test <- j %in% unique(temp_data2$b2)
if (test == FALSE) {
print(unique(temp_data1[temp_data1$a1 == i & temp_data1$a2 == j, ]))
}
}
}
for (k in unique(data1[which(!(data1$a1 %in% data2$b1)), "a1"])) {
print(unique(data1[data1$a1 == k, c("a1", "a2")]))
}
}
test(a, b)
a1 a2
2 1 1
a1 a2
5 4 2
Based on your answer I improved the function test(). This version returns a dataframe:
a <- data.frame(a1=c(1,1,1,3,4), a2=c(2,1,2,1,2))
b <- data.frame(b1=c(1,1,3,1,3), b2=c(2,2,1,2,1))
test <- function (a, b) {
R <- subset(a,!a1 %in% b$b1)
I <- unique(a$a1[a$a1 %in% b$b1])
for (i in I) {
ai <- subset(a, a1 == i)
bi <- subset(b, b1 == i)
J <- unique(bi$b2)
for (j in unique(ai$a2)) if (! j %in% J) R <- rbind(subset(ai, a2==j), R)
}
R
}
test(a, b)
I would like to achieve the following data.frame in R:
i1 i2 i3
1 A1 A2 A3
2 No A2 A3
3 A1 No A3
4 No No A3
5 A1 A2 No
6 No A2 No
7 A1 No No
8 No No No
In each column the variable can either be the concatenated string "A" and the column number or "No". The data.frame should contain all possible combinations.
My idea was to use expand.grid, but I don't know how to create the list dynamically. Or is there a better approach?
expand.grid(list(c("A1", "No"), c("A2", "No"), c("A3", "No")))
I guess you could create your own helper function, something like that
MyList <- function(n) expand.grid(lapply(paste0("A", seq_len(n)), c, "No"))
Then simply pass it the number of elements (e.g., 3)
MyList(3)
# Var1 Var2 Var3
# 1 A1 A2 A3
# 2 No A2 A3
# 3 A1 No A3
# 4 No No A3
# 5 A1 A2 No
# 6 No A2 No
# 7 A1 No No
# 8 No No No
Alternatively, you could also try data.tables CJ equivalent which should much more efficient than expand.grid for a big n
library(data.table)
DTCJ <- function(n) do.call(CJ, lapply(paste0("A", seq_len(n)), c, "No"))
DTCJ(3) # will return a sorted cross join
# V1 V2 V3
# 1: A1 A2 A3
# 2: A1 A2 No
# 3: A1 No A3
# 4: A1 No No
# 5: No A2 A3
# 6: No A2 No
# 7: No No A3
# 8: No No No
Another option is using Map with expand.grid
n <- 3
expand.grid(Map(c, paste0('A', seq_len(n)), 'NO'))
Or
expand.grid(as.data.frame(rbind(paste0('A', seq_len(n)),'NO')))
Another option, only using the most fundamental functions in R, is to use the indices:
df <- data.frame(V1 = c('A','A','A', 'A',rep('No',4)), V2 = c('A','A','No','No','A','A','No','No'), V3 = c('A','No','A','No','A','No','A','No'), stringsAsFactors = FALSE)
to get the row and col indices of the elements we need to change:
rindex <- which(df != 'No') %% nrow(df)
cindex <- ceiling(which(df != 'No')/nrow(df))
the solution is basically a one-liner:
df[matrix(c(rindex,cindex),ncol=2)] <- paste0(df[matrix(c(rindex,cindex),ncol=2)],cindex)
> df
V1 V2 V3
1 A1 A2 A3
2 A1 A2 No
3 A1 No A3
4 A1 No No
5 No A2 A3
6 No A2 No
7 No No A3
8 No No No
My data looks something like this:
DF<- data.frame( id=c("A1","A2","A3","A1"), submission=c(1,1,1,2))
What is the best way of keeping only the last submission for each id? That is:
DF<- data.frame( id=c("A2","A3","A1"), submission=c(1,1,2))
Thanks!
Here are a few options in base R:
DF[!duplicated(DF$id, fromLast=TRUE),]
# id submission
# 2 A2 1
# 3 A3 1
# 4 A1 2
do.call(rbind, by(DF, DF$id, FUN=tail, 1))
# id submission
# A1 A1 2
# A2 A2 1
# A3 A3 1
aggregate(submission ~ id, DF, tail, 1)
# id submission
# 1 A1 2
# 2 A2 1
# 3 A3 1