I'm trying to recode a SAS array variable based on other variables condition. I am getting array subscript out of range error
datasets Chk1 and Chk2 have the following data
Chk1
id Var1 Var2
1 Y N
2 N Y
3 Y Y
Chk2
id Var3 Var4
1 N Y
2 Y N
3 Y Y
My desired output is
Chk3
id Var1 Var2 Var3 Var4 a1 a2 a3 b1 b2 b3
1 Y N N Y 1 1
2 N Y Y N 1 1
3 Y Y Y Y 1 1
Here is the data merge with recoding variables
I need your help in tweaking do loop with incremental value.
data Chk3;
merge Chk1(in=a) Chk2(in=b);
by id;
if a and b;
array wv(*) var1 var2;
array wv2(*) var3 var4;
array wv3(*) a1 a2 a3 b1 b2 b3;
do i=1 to dim(wv3) by 3;
if wv(i)='Y' and wv2(i)='N' then wv3(i)=1;
if wv(i)='N' and wv2(i)='Y' then wv3(i+1)=1;
if wv(i)=wv2(i) then wv3(i+2)=1;
end;
run;
You are flagging an assertion of a bitwise combinatoric. The BOR and BLSHIFT bitwise functions will aid you greatly.
data have1; input
id Var1 $ Var2 $; datalines;
1 Y N
2 N Y
3 Y Y
data have2; input
id Var3 $ Var4 $; datalines;
1 N Y
2 Y N
3 Y Y
run;
data want;
merge have1 have2;
by id;
array have1bitCombo a1-a3;
array have2bitCombo b1-b3;
have1bitCombo[bor(var1='Y', blshift(var2='Y',1))] = 1;
have2bitCombo[bor(var3='Y', blshift(var4='Y',1))] = 1;
run;
below should work:
data Chk3;
merge Chk1(in=a) Chk2(in=b);
by id;
if a and b;
array wv(*) var1 var2;
array wv2(*) var3 var4;
array wv3(*) a1 a2 a3 b1 b2 b3;
do i=1 to dim(wv2);
if wv(i)='Y' and wv2(i)='N' then
wv3((i-1)*3+1)=1;
if wv(i)='N' and wv2(i)='Y' then
wv3((i-1)*3+2)=1;
if wv(i)=wv2(i) then
wv3((i-1)*3+3)=1;
end;
run;
Obs id var1 var2 var3 var4 a1 a2 a3 b1 b2 b3 i
1 1 Y N N Y 1 . . . 1 . 3
2 2 N Y Y N . 1 . 1 . . 3
3 3 Y Y Y Y . . 1 . . 1 3
Related
I'm trying to cbind or unnest or as.data.table a partially nested list.
id <- c(1,2)
A <- c("A1","A2","A3")
B <- c("B1")
AB <- list(A=A,B=B)
ABAB <- list(AB,AB)
nested_list <- list(id=id,ABAB=ABAB)
The length of id is the same as ABAB (2 in this case). I don't know how to unlist a part of this list (ABAB) and cbind another part (id). Here's my desired result as a data.table:
data.table(id=c(1,1,1,2,2,2),A=c("A1","A2","A3","A1","A2","A3"),B=rep("B1",6))
id A B
1: 1 A1 B1
2: 1 A2 B1
3: 1 A3 B1
4: 2 A1 B1
5: 2 A2 B1
6: 2 A3 B1
I haven't tested for more general cases, but this works for the OP example:
library(data.table)
as.data.table(nested_list)[, lapply(ABAB, as.data.table)[[1]], id]
# id A B
#1: 1 A1 B1
#2: 1 A2 B1
#3: 1 A3 B1
#4: 2 A1 B1
#5: 2 A2 B1
#6: 2 A3 B1
Or another option (which is probably faster, but is more verbose):
rbindlist(lapply(nested_list$ABAB, as.data.table),
idcol = 'id')[, id := nested_list$id[id]]
This is some super ugly base R, but produces the desired output.
Reduce(rbind, Map(function(x, y) setNames(data.frame(x, y), c("id", "A", "B")),
as.list(nested_list[[1]]),
lapply(unlist(nested_list[-1], recursive=FALSE),
function(x) Reduce(cbind, x))))
id A B
1 1 A1 B1
2 1 A2 B1
3 1 A3 B1
4 2 A1 B1
5 2 A2 B1
6 2 A3 B1
lapply takes the a list of two elements (each containing the A and B variables) extracted with unlist and recursive=FALSE. It returns a list of character matrices with the B variable filled in by recycling. A list of the individual id variables from as.list(nested_list[[1]]) and the lit of matrices are fed to Map which converts corresponding pairs to a data.frame and gives the columns the desired names and returns a list of data.frames. Finally, this list of data.frames is fed to Reduce, which rbinds the results to a single data.frame.
The final Reduce(rbind, could be replaced by data.tables rbindlist if desired.
Here's another hideous solution
max_length = max(unlist(lapply(nested_list, function(x) lapply(x, lengths))))
data.frame(id = do.call(c, lapply(nested_list$id, rep, max_length)),
do.call(rbind, lapply(nested_list$ABAB, function(x)
do.call(cbind, lapply(x, function(y) {
if(length(y) < max_length) {
rep(y, max_length)
} else {
y
}
})))))
# id A B
#1 1 A1 B1
#2 1 A2 B1
#3 1 A3 B1
#4 2 A1 B1
#5 2 A2 B1
#6 2 A3 B1
And one more, also inelegant- but I`d gone too far by the time I saw the other answers.
restructure <- function(nested_l) {
ids <- as.numeric(max(unlist(lapply(unlist(nested_l, recursive = FALSE), function(x){
lapply(x, length)
}))))
temp = data.frame(rep(nested_l$id, each = ids),
sapply(1:length(nested_l$id), function(x){
out <-unlist(lapply(nested_l[[2]], function(y){
return(y[x])
}))
}))
names(temp) <- c("id", unique(substring(unlist(nested_l[2]), first = 1, last = 1)))
return(temp)
}
> restructure(nested_list)
id A B
1 1 A1 B1
2 1 A2 B1
3 1 A3 B1
4 2 A1 B1
5 2 A2 B1
6 2 A3 B1
Joining the party:
library(tidyverse)
temp <- map(nested_list,~map(.x,~expand.grid(.x)))
df <- map_df(1:2,~cbind(temp$id[[.x]],temp$ABAB[[.x]]))
Var1 A B
1 1 A1 B1
2 1 A2 B1
3 1 A3 B1
4 2 A1 B1
5 2 A2 B1
6 2 A3 B1
This is a bit odd to explain. I have two dataframes. The first is a reference, the second may or may not have missing values. Both dataframes share the same variables. If there is an NA in the second dataframe I want to create a new dataframe based on the reference values but remove that variable that's missing. Each odd and even variables are associated. So if an odd variable has an NA value, that variable + the next variable needs to be removed in the new dataframe. If there is an even variable with an NA value that variable + the previous variable needs to be removed. So var 1 and var 2 are related, var 3 and var 4, var 5 and var 6, etc. More or less I have it working, but my code is having issues with NULL values.
is.odd <- function(x) x %% 2 == 1
apply(new, 1, function(x) {
if(is.odd(which(is.na(x)))) {
toremove=c(which(is.na(x)), which(is.na(x))+1)
}
if(!is.odd(which(is.na(x)))) {
toremove=c(which(is.na(x)), which(is.na(x))-1)
}
ref[,!(1:ncol(ref) %in% toremove), drop=F]
})
The error I am getting is "Error in if (is.odd(which(is.na(x)))) { : argument is a length of zero".
Here is an example of what I want. The first is the reference dataframe:
var1 var2 var3 var4
1 q w e r
2 t y u i
3 o p a s
4 d f g h
The second has missing NA values:
var1 var2 var3 var4
1 1 1 1 1
2 1 3 2 NA
3 1 NA a s
4 d f g h
Each row of the second dataframe is used to create the new dataframe based on reference values. For the first row the output should be complete since there are no NAs:
var1 var2 var3 var4
1 q w e r
2 t y u i
3 o p a s
4 d f g h
For the second row the output should be missing the 3rd and 4th variable since the 4th had an NA and 3 + 4 (odd + even) are related. :
var1 var2
1 q w
2 t y
3 o p
4 d f
Update: I think the problem is with which(is.na(x) returning numeric(0). I'm attempting to control for that with:
if(!identical(which(is.na(x))), numeric(0)) {}
But it doesn't appear to be matching.
We can try
apply(df2, 1, function(x) {if(all(!is.na(x)))
df1 else {
i1 <- which(is.na(x))
df1[-c(sapply(i1, function(i)
if(!i%%2) c(max(1, i-1), i)
else c(i, min(length(x), i+1))))]
}
}
)
#$`1`
# var1 var2 var3 var4
#1 q w e r
#2 t y u i
#3 o p a s
#4 d f g h
#$`2`
# var1 var2
#1 q w
#2 t y
#3 o p
#4 d f
#$`3`
# var3 var4
#1 e r
#2 u i
#3 a s
#4 g h
#$`4`
# var1 var2 var3 var4
#1 q w e r
#2 t y u i
#3 o p a s
#4 d f g h
I have two data frames. One is considered a reference and has every value, the other may or may not be missing values. I want to compare both data frames, then delete the values from the reference data frame that have NA in the other. However, each row of the data frame that can have missing values needs to be treated as a single comparison so you are developing a unique reference for every single row. For example the reference dataframe(1):
var1 var2 var3
1 a b c
2 q w e
3 z x n
The other data frame(2):
var1 var2 var3
1 p o i
2 u y t
3 NA e w
4 l k NA
For row 1 and 2 of data frame 2 I need a full reference since no missing values:
var1 var2 var3
1 a b c
2 q w e
3 z x n
For row 3 of data frame 2 I need:
var2 var3
1 b c
2 w e
3 x n
For row 4 of data frame 2 I need:
var1 var2
1 a b
2 q w
3 z x
Try:
> ref<-data.frame(var1=c('a','q','z'),var2=c('b','w','x'),var3=c('c','e','n'))
> new<-data.frame(var1=c('p','u',NA,'l'),var2=c('o','y','e','k'),var3=c('i','t','w',NA))
> apply(new,1,function(x) ref[,which(!is.na(x))] )
[[1]]
var1 var2 var3
1 a b c
2 q w e
3 z x n
[[2]]
var1 var2 var3
1 a b c
2 q w e
3 z x n
[[3]]
var2 var3
1 b c
2 w e
3 x n
[[4]]
var1 var2
1 a b
2 q w
3 z x
As requested in author's comment, if he also wants to:
"remove +1 index if the NA is at an odd index, and -1 index if the NA is at an even index".
is.odd <- function(x) x %% 2 == 1
apply(new, 1, function(x) {
toremove <-which(is.na(x))
toremove1<-sapply(toremove,function(x) ifelse(is.odd(x),x+1,x-1) )
ref[,!(1:ncol(ref) %in% c(toremove,toremove1)),drop=F]
})
I would like to compare two columns simultaneously. My data looks like this:
a <- data.frame("a1" = c(1,1,1,3,4), "a2" = c(2,1,2,1,2))
b <- data.frame("b1" = c(1,1,3,1,3), "b2" = c(2,2,1,2,1))
cbind(a, b)
# a1 a2 b1 b2
# 1 1 2 1 2
# 2 1 1 1 2
# 3 1 2 3 1
# 4 3 1 1 2
# 5 4 2 3 1
I would like to identify all rows of a where a1 is not in b1 or where a1 is in b1 but a2 for the special a1 is not in b2 for the special b2. So the second question is: When a1 is in b1 is then a2 for this row for a1 also in b2 for this row for b1.
Example for line 2: I am checking, if a1 = 1 is anywhere in b1 = c(1,1,3,1,3). It is, so I want to check if a2 = 1 in line 2 (where a1 = 1) is anywhere in b2 where b1 = a1 = 1, so here b2 = c(2, 2, 2). For line 2 a2 = 1 is not in b2 = c(2, 2, 2), so the result should show me this line.
The first question is easy to answer with the following code:
a[which(!(a$a1 %in% b$b1)), ]
# a1 a2
# 5 4 2
But I can't fix the second problem. Maybe I am working in a wrong way with the logical operators. My result should look like this:
a1 a2
2 1 1
4 4 2
Following the explanation in your edit, you want the rows where either the specific a1 from a is not in b1 from b or where the specific a1 from a is equal to b1 of the same row in b and a2 from a is not among the values of b2 from b of the rows for which b1 equals the value of the specific a1.
In R, you can write this like that:
cond <- sapply(seq(nrow(a)), # check each row, one by one
function (i){
!(a$a1[i] %in% b$b1) | # a1 of the specific row is not in b1 or
!(a$a2[i] %in% b$b2[b$b1==a$a1[i]]) # a2 of the specific row is not in the values of b2 for which b1 equals a1 of the sepcific row
})
a[cond, ]
# a1 a2
#2 1 1
#5 4 2
Obviously not a nice solution, but it works with my data (unequal dimension of rows of the two datasets, not the same position of the values in the variables) - here with new example data, because I chose the first really bad.
a <- data.frame("a1" = c(1,1,1,3,4), "a2" = c(2,1,2,1,2))
b <- data.frame("b1" = c(1,3,1,1), "b2" = c(2,1,2,2))
test <- function (data1, data2) {
for (i in unique(data1[data1$a1 %in% data2$b1, "a1"])) {
temp_data1 <- data1[data1$a1 == i, c("a1", "a2")]
temp_data2 <- data2[data2$b1 == i, c("b1", "b2")]
for (j in unique(temp_data1$a2)) {
test <- j %in% unique(temp_data2$b2)
if (test == FALSE) {
print(unique(temp_data1[temp_data1$a1 == i & temp_data1$a2 == j, ]))
}
}
}
for (k in unique(data1[which(!(data1$a1 %in% data2$b1)), "a1"])) {
print(unique(data1[data1$a1 == k, c("a1", "a2")]))
}
}
test(a, b)
a1 a2
2 1 1
a1 a2
5 4 2
Based on your answer I improved the function test(). This version returns a dataframe:
a <- data.frame(a1=c(1,1,1,3,4), a2=c(2,1,2,1,2))
b <- data.frame(b1=c(1,1,3,1,3), b2=c(2,2,1,2,1))
test <- function (a, b) {
R <- subset(a,!a1 %in% b$b1)
I <- unique(a$a1[a$a1 %in% b$b1])
for (i in I) {
ai <- subset(a, a1 == i)
bi <- subset(b, b1 == i)
J <- unique(bi$b2)
for (j in unique(ai$a2)) if (! j %in% J) R <- rbind(subset(ai, a2==j), R)
}
R
}
test(a, b)
I am stuck on this problem and would be happy for advice. I have the following data.frame:
c1 <- factor(c("a","a","a","a"))
c2 <- factor(c("b","b","y","b"))
c3 <- factor(c("c","y","z","c"))
c4 <- factor(c("y","z","","y"))
c5 <- factor(c("z","","","z"))
x <- data.frame(c1,c2,c3,c4,c5)
So this data looks like this:
c1 c2 c3 c4 c5
1 a b c y z
2 a b y z
3 a y z
4 a b c y z
So in each row, there is a sequence of varying length of a, b, c which concludes with values for y and z. What I need to do is to move values y and z each to separate column that I can work with, so the data looks like this:
c6 c7 c8 c9 c10
1 a b c y z
2 a b y z
3 a y z
4 a b c y z
I have worked out to identify the length of each sequence per row and added that as a column, so I know which column y and z is located in:
x$not.na <- apply(paths, 1, function(x) length(which(!x=="")))
But I am stuck on how to loop(?) over each row to perform the necessary cut and paste of z and y.
Something like this:
lastTwoToEnd<-function(x){
i<-sum(x!="")-1:0
x[c(setdiff(seq_along(x),i),i)]
}
data.frame(t(apply(x,1,lastTwoToEnd)))
## X1 X2 X3 X4 X5
## 1 a b c y z
## 2 a b y z
## 3 a y z
## 4 a b c y z