# Example data: item columns A1-A3 and B1-B3, each containing some NA values.
df <- data.frame(A1 = c(6, 8, NA, 1, 5),
A2 = c(NA, NA, 9, 3, 6),
A3 = c(9, NA, 1, NA, 4),
B1 = c(NA, NA, 9, 3, 6),
B2 = c(9, NA, 1, NA, 4),
B3 = c(NA, NA, 9, 3, 6)
)
I have a dataset with multiple questionnaires that each have multiple items. I would like to replace the missing data with the row mean of the observed values for each of the questionnaires (missing values in A items replaced by the row mean of A1 to A3, and missing values in B items replaced by the row mean of B1 to B3). What is the best way to do that?
You may try
library(dplyr)  # provides %>%, select() and starts_with() used below

# Example data: item columns A1-A3 and B1-B3, each containing some NA values.
df <- data.frame(
  A1 = c(6, 8, NA, 1, 5),
  A2 = c(NA, NA, 9, 3, 6),
  A3 = c(9, NA, 1, NA, 4),
  B1 = c(NA, NA, 9, 3, 6),
  B2 = c(9, NA, 1, NA, 4),
  B3 = c(NA, NA, 9, 3, 6)
)

# Split the items into one data frame per questionnaire prefix.
df1 <- df %>%
  select(starts_with("A"))
df2 <- df %>%
  select(starts_with("B"))

# x1 / x2 hold the (row, col) positions of the missing cells; each missing
# cell is filled with the mean of the observed values in its own row.
# A row with no observed value in a questionnaire gets NaN.
x1 <- which(is.na(df1), arr.ind = TRUE)
df1[x1] <- rowMeans(df1, na.rm = TRUE)[x1[, 1]]
x2 <- which(is.na(df2), arr.ind = TRUE)
df2[x2] <- rowMeans(df2, na.rm = TRUE)[x2[, 1]]

# Put the two imputed questionnaires back side by side.
df <- cbind(df1, df2)
df
A1 A2 A3 B1 B2 B3
1 6 7.5 9 9 9 9
2 8 8.0 8 NaN NaN NaN
3 5 9.0 1 9 1 9
4 1 3.0 2 3 3 3
5 5 6.0 4 6 4 6
You may use split.default to split data in different groups and replace NA with row-wise mean (taken from this answer https://stackoverflow.com/a/6918323/3962914 )
# Impute NAs with the row mean of each questionnaire. Columns are grouped by
# their name with the digits removed (e.g. A1, A2, A3 -> "A").
imputed <- lapply(split.default(df, sub("\\d+", "", names(df))), function(x) {
  # Logical mask of missing cells; assignment is a no-op when nothing is missing.
  k <- is.na(x)
  # row(k)[k] gives the row index of every missing cell, in the same
  # column-major order in which the masked cells are filled.
  x[k] <- rowMeans(x, na.rm = TRUE)[row(k)[k]]
  x
})
# split.default() returns the groups in sorted group order, which need not be
# the column order of df. Bind the pieces under their own column names and
# then restore df's original order, instead of relabelling by position.
result <- do.call(cbind, unname(imputed))[names(df)]
result
# A1 A2 A3 B1 B2 B3
#1 6 7.5 9 9 9 9
#2 8 8.0 8 NaN NaN NaN
#3 5 9.0 1 9 1 9
#4 1 3.0 2 3 3 3
#5 5 6.0 4 6 4 6
You could also do:
library(dplyr)
library(tidyr)  # pivot_wider() lives in tidyr, not dplyr

df %>%
  # Wide -> long: the A1..A3 / B1..B3 columns are split on the trailing number
  # into value columns A and B; the steps below rely on the `time` and `id`
  # columns that reshape() adds.
  reshape(varying = names(.), direction = "long", sep = "") %>%
  # One group per original row, so mean() below is that row's mean.
  group_by(id) %>%
  mutate(across(A:B, ~ replace(.x, is.na(.x), mean(.x, na.rm = TRUE)))) %>%
  # Long -> wide again; id_cols must be passed by name in current tidyr.
  pivot_wider(
    id_cols = id, names_from = time, values_from = A:B, names_sep = ""
  ) %>%
  ungroup() %>%
  select(-id)
# A tibble: 5 x 6
A1 A2 A3 B1 B2 B3
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 6 7.5 9 9 9 9
2 8 8 8 NaN NaN NaN
3 5 9 1 9 1 9
4 1 3 2 3 3 3
5 5 6 4 6 4 6
We can use split.default with na.aggregate
library(purrr)
library(zoo)
library(dplyr)
library(stringr)
# Group the columns by their name with the digits stripped, then impute each
# group. The items are transposed before na.aggregate() and transposed back
# afterwards, so the fill runs along the rows of the original data.
df %>%
  split.default(str_remove(names(.), "\\d+")) %>%
  map_dfc(function(items) {
    filled <- na.aggregate(t(items))
    as_tibble(t(filled))
  })
# A tibble: 5 × 6
A1 A2 A3 B1 B2 B3
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 6 7.5 9 9 9 9
2 8 8 8 NaN NaN NaN
3 5 9 1 9 1 9
4 1 3 2 3 3 3
5 5 6 4 6 4 6
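For each questionnaire, expand the row means into a matrix with the same shape as its item columns and use it to replace the NAs. This is done inside an lapply that selects each questionnaire's columns by name.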
# For each questionnaire prefix, take its item columns and overwrite every NA
# with the mean of the non-missing items in the same row.
do.call(cbind, lapply(c("A", "B"), function(q) {
  # startsWith() matches the prefix only (grep(q, ...) would also pick up any
  # column that merely contains the letter); drop = FALSE keeps a data frame
  # even when a questionnaire has a single item, so row(s) below still works.
  s <- df[, startsWith(names(df), q), drop = FALSE]
  na <- is.na(s)
  # rowMeans(...)[row(s)] repeats each row's mean across that row's cells;
  # [na] then keeps only the values for the missing cells.
  replace(s, na, rowMeans(s, na.rm = TRUE)[row(s)][na])
}))
# A1 A2 A3 B1 B2 B3
# 1 6 7.5 9 9 9 9
# 2 8 8.0 8 NaN NaN NaN
# 3 5 9.0 1 9 1 9
# 4 1 3.0 2 3 3 3
# 5 5 6.0 4 6 4 6
Data:
# Reproducible copy of the example data frame (5 rows, columns A1-A3 and B1-B3).
df <- structure(list(A1 = c(6, 8, NA, 1, 5), A2 = c(NA, NA, 9, 3, 6
), A3 = c(9, NA, 1, NA, 4), B1 = c(NA, NA, 9, 3, 6), B2 = c(9,
NA, 1, NA, 4), B3 = c(NA, NA, 9, 3, 6)), class = "data.frame", row.names = c(NA,
-5L))
I have the following problem, which you can easily see after downloading the picture. It would be of great help if you could help me solve it.
In Table 1, the IDs are correctly linked up with the values in column A. B contains some values which are not ordered and whose corresponding IDs are not given in corresponding rows. We need to find the IDs of the values in column B by using the IDs of column A.
Now if we run the following code in R, we will find the IDs corresponding to the values in column B:
# Plain ASCII quotes: the typographic quotes in the original are a syntax error.
mydata <- read.csv("C:/Users/Windows/Desktop/practice_1.csv")
# Column 1 holds the values to look up (B); the remaining two columns (A, ID)
# form the key/value table. `header = TRUE` is a read.csv() option (and its
# default); passed to data.frame() it only created a stray extra column.
df <- data.frame(mydata$B, mydata$A, mydata$ID)
library(qdap)  # provides the %l% lookup operator
# NOTE(review): presumably %l% treats the first column of df[, -1] as the key
# and the second as the value to return - confirm against the qdap docs.
df[, "New ID"] <- df[, 1] %l% df[, -1]
After running above code, we will find the new ID in the column New ID like Table 2.
What you need is a simple match operation:
# For every z, find the row whose y equals it and take that row's ID1;
# z values with no matching y (including NA) yield NA.
table1[["ID2"]] <- with(table1, ID1[match(z, y)])
table1
# ID1 y z ID2
# 1 0 1 11 10
# 2 1 2 3 2
# 3 2 3 5 4
# 4 3 4 4 3
# 5 4 5 8 7
# 6 5 6 7 6
# 7 6 7 15 15
# 8 7 8 6 5
# 9 8 9 2 1
# 10 9 10 16 17
# 11 10 11 1 0
# 12 11 12 NA NA
# 13 15 15 NA NA
# 14 17 16 NA NA
The next time you ask a question where sample data is necessary (most questions), please provide the data in this format:
Data
# dput(table1)
structure(list(ID1 = c(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 15, 17), y = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 16), z = c(11, 3, 5, 4, 8, 7, 15, 6, 2, 16, 1, NA, NA, NA), ID2 = c(10, 2, 4, 3, 7, 6, 15, 5, 1, 17, 0, NA, NA, NA)), row.names = c(NA, -14L), class = "data.frame")
I don't know exactly how to explain it but...
I have a sparse table where each group represents a level. The columns are ordered; that is, the downstream (left) column represents a child node and the upstream (right) column represents a parent node.
I'd like a two-column table where the 1st column is the parent node and the 2nd is the child node. If possible, also a 3rd column with the length (sum of the number of final nodes) of the parents.
Follow the example:
>tt <- tibble(
ID = letters[1:8],
`1` = c( 1, 1, 1, 1, 2, 2, 2, 2),
`2` = c( 3, 3, 4, 4, 5, 5, 5, 6),
`3` = c( 7, 7, 8, 9,10,10,11,12)
)
> tt
# A tibble: 8 x 4
ID `1` `2` `3`
<chr> <dbl> <dbl> <dbl>
1 a 1 3 7
2 b 1 3 7
3 c 1 4 8
4 d 1 4 9
5 e 2 5 10
6 f 2 5 10
7 g 2 5 11
8 h 2 6 12
>dput(tt)
structure(list(ID = c("a", "b", "c", "d", "e", "f", "g", "h"),
`1` = c(1, 1, 1, 1, 2, 2, 2, 2), `2` = c(3, 3, 4, 4, 5, 5,
5, 6), `3` = c(7, 7, 8, 9, 10, 10, 11, 12)), row.names = c(NA,
-8L), class = c("tbl_df", "tbl", "data.frame"))
the result should be:
>ttt <- tibble(
parent = c(1,1,2,2,3,4,4, 5, 5, 6, 7,7,8,9,10,10,11,12),
child = c(3,4,5,6,7,8,9,10,11,12, letters[1:8] ),
length = c(4,4,4,4,2,2,2, 3, 3, 1, 2,2,1,1, 2, 2, 1, 1)
)
>ttt
# A tibble: 18 x 3
parent child length
<dbl> <chr> <dbl>
1 1 3 4
2 1 4 4
3 2 5 4
4 2 6 4
5 3 7 2
6 4 8 2
7 4 9 2
8 5 10 3
9 5 11 3
10 6 12 1
11 7 a 2
12 7 b 2
13 8 c 1
14 9 d 1
15 10 e 2
16 10 f 2
17 11 g 1
18 12 h 1
> dput(ttt)
structure(list(parent = c(1, 1, 2, 2, 3, 4, 4, 5, 5, 6, 7, 7,
8, 9, 10, 10, 11, 12), child = c("3", "4", "5", "6", "7", "8",
"9", "10", "11", "12", "a", "b", "c", "d", "e", "f", "g", "h"
), length = c(4, 4, 4, 4, 2, 2, 2, 3, 3, 1, 2, 2, 1, 1, 2, 2,
1, 1)), row.names = c(NA, -18L), class = c("tbl_df", "tbl", "data.frame"
))
Any help is appreciated.
Thanks in advance.
This gets you 90% of the way there:
# Move the first column (ID) to the end so that every column is immediately
# followed by the column holding its children. Computed from the number of
# columns, so it is not tied to exactly three levels.
tt_correct <- tt[, c(seq_along(tt)[-1], 1)]

# Walk over each adjacent (parent, child) column pair and stack the results.
ttt <- do.call(
  rbind,
  lapply(
    seq_len(ncol(tt_correct) - 1),
    function(i) {
      DF <- tt_correct[, c(i, i + 1)]
      names(DF) <- c("parent", "child")
      # Number of rows of the original table that fall under each parent.
      DF$length <- ave(DF$parent, DF$parent, FUN = length)
      # Collapse repeated parent/child pairs to a single edge.
      unique(DF)
    }
  )
)
ttt
# A tibble: 18 x 3
parent child length
<dbl> <chr> <dbl>
1 1 3 4
2 1 4 4
3 2 5 4
4 2 6 4
5 3 7 2
6 4 8 2
7 4 9 2
8 5 10 3
9 5 11 3
10 6 12 1
11 7 a 2
12 7 b 2
13 8 c 1
14 9 d 1
15 10 e 2
16 10 f 2
17 11 g 1
18 12 h 1
The first part is correcting the order. Your expected output indicates that the 1st column is a child of the 4th column. The lapply() statement largely walks along the data.frame and stacks the data.
This is 90% of the way because the answer doesn't agree with your expected output for lengths. I think this is correct but I could be wrong.
Finally, and I'm not that good with igraph, you could likely find additional information doing:
library(igraph)
# Build a graph from the first two columns of ttt (parent, child) and draw it.
relation_graph <- graph_from_data_frame(ttt[, 1:2])
plot(relation_graph)
Hi everybody, I am working with a list of data frames in R. Lists are awesome in R, but I want to solve this. I have a list named global that has five data frames f1, f2, f3, f4, f5. Each data frame has a principal variable named CreditValue and variables that work like flags. For example, f1 has CreditValue and a flag variable b1 with values of 1. f2 has two flag variables: b1 with values of 1 and b2 with values of 2. f3 has three flag variables: b1 with values of 1, b2 with values of 2 and b3 with values of 3. f4 has four flag variables: b1 with values of 1, b2 with values of 2, b3 with values of 3 and b4 with values of 4. f5 has five flag variables: b1 with values of 1, b2 with values of 2, b3 with values of 3, b4 with values of 4 and b5 with values of 5. Flag variables always start in column 3 for all data frames. I wish to compute the sum of CreditValue in each data frame considering different aspects of the flag variables. My list has the following structure (I include the dput version in the final part):
global
$f1
KeyID CreditValue b1
1 001 1 1
2 002 2 1
3 003 3 1
4 004 4 1
5 005 5 1
6 006 6 1
7 007 7 1
8 009 8 1
9 010 9 1
$f2
KeyID CreditValue b1 b2
1 001 1 1 2
2 002 2 1 2
3 003 3 NA 2
4 004 4 NA 2
5 005 5 NA 2
6 006 6 1 2
7 007 7 1 2
8 009 8 NA 2
9 010 9 1 2
10 011 10 NA 2
11 012 11 1 2
$f3
KeyID CreditValue b1 b2 b3
1 001 1 1 2 3
2 002 2 1 2 3
3 003 3 1 2 3
4 004 4 1 2 3
5 005 5 NA 2 3
6 006 6 NA 2 3
7 007 7 1 2 3
8 009 8 1 2 3
9 010 9 NA NA 3
10 011 10 NA NA 3
11 012 11 NA 2 3
12 013 11 1 2 3
13 014 11 NA NA 3
$f4
KeyID CreditValue b1 b2 b3 b4
1 001 1 NA 2 3 4
2 002 2 NA 2 3 4
3 003 3 NA NA NA 4
4 004 4 NA NA NA 4
5 005 5 NA NA NA 4
6 006 6 1 2 3 4
7 007 7 1 2 3 4
8 009 8 1 2 3 4
9 010 9 1 2 3 4
10 011 10 1 2 3 4
11 012 11 1 2 3 4
12 013 11 1 2 3 4
13 014 11 1 2 3 4
14 015 12 1 NA 3 4
15 016 12 1 NA 3 4
$f5
KeyID CreditValue b1 b2 b3 b4 b5
1 001 1 1 2 3 4 5
2 002 2 1 2 3 4 5
3 003 3 1 2 3 4 5
4 004 4 1 2 3 4 5
5 005 5 NA NA 3 4 5
6 006 6 1 2 3 4 5
7 007 7 1 2 3 4 5
8 009 8 1 2 3 4 5
9 010 9 1 2 3 4 5
10 011 10 NA NA NA NA 5
11 012 11 1 2 3 4 5
12 013 11 1 2 3 4 5
13 014 11 1 2 3 4 5
14 015 12 1 2 3 4 5
15 016 12 1 2 3 4 5
16 017 14 NA NA NA 4 5
17 018 14 NA NA NA 4 5
I have used the llply() function from the plyr package to work with lists in R, but I don't know how to define a function to do this. I compute the sums using this code, but if I had more data frames it would become very complex. I would also like to save these values in a new data frame or matrix considering the flag variables (5). The results of the sums are the following:
sum(f1$CreditValue[f1[,3]==1])
[1] 45
sum(f2$CreditValue[f2[,3]==1],na.rm=TRUE)
[1] 36
sum(f3$CreditValue[f3[,3]==1],na.rm=TRUE)
[1] 36
sum(f4$CreditValue[f4[,3]==1],na.rm=TRUE)
[1] 97
sum(f5$CreditValue[f5[,3]==1],na.rm=TRUE)
[1] 97
These sums are computed applying those formulas considering b1 variable in all data frames.
sum(f2$CreditValue[is.na(f2[,3]) & f2[,4]==2] ,na.rm=TRUE)
[1] 30
sum(f3$CreditValue[is.na(f3[,3]) & f3[,4]==2] ,na.rm=TRUE)
[1] 22
sum(f4$CreditValue[is.na(f4[,3]) & f4[,4]==2] ,na.rm=TRUE)
[1] 3
sum(f5$CreditValue[is.na(f5[,3]) & f5[,4]==2] ,na.rm=TRUE)
[1] 0
These sums are computed applying those formulas considering values of b2 and b1 variables in all data frames. Here there is a condition over values of b1 (column 3).
sum(f3$CreditValue[is.na(f3[,3]) & is.na(f3[,4]) & f3[,5]==3] ,na.rm=TRUE)
[1] 30
sum(f4$CreditValue[is.na(f4[,3]) & is.na(f4[,4]) & f4[,5]==3] ,na.rm=TRUE)
[1] 0
sum(f5$CreditValue[is.na(f5[,3]) & is.na(f5[,4]) & f5[,5]==3] ,na.rm=TRUE)
[1] 5
These sums are computed applying those formulas considering values of b3, b2 and b1 variables in all data frames. Now there is a condition over values of b1 and b2 (columns 3, 4).
sum(f4$CreditValue[is.na(f4[,3]) & is.na(f4[,4]) & is.na(f4[,5]) & f4[,6]==4] ,na.rm=TRUE)
[1] 12
sum(f5$CreditValue[is.na(f5[,3]) & is.na(f5[,4]) & is.na(f5[,5]) & f5[,6]==4] ,na.rm=TRUE)
[1] 28
These sums are computed applying those formulas considering values of b4, b3, b2 and b1 variables in all data frames. Now there is a condition over values of b1, b2 and b3 (columns 3, 4, 5).
sum(f5$CreditValue[is.na(f5[,3]) & is.na(f5[,4]) & is.na(f5[,5]) & is.na(f5[,6]) & f5[,7]==5] ,na.rm=TRUE)
[1] 10
This sum is computed applying last formula considering values of b5, b4, b3, b2 and b1 variables in all data frames. Now there is a condition over values of b1, b2, b3 and b4 (columns 3, 4, 5, 6).
The sums shown are the result of a lot of code, but I would like to create a function that works over the flag variables (b1, b2, b3, b4, b5) to compute the sums. I don't know if it is possible to do this with a for loop or a function that works with llply or lapply. I have tried to shorten code like this:
sum(f5$CreditValue[is.na(f5[,3]) & is.na(f5[,4]) & is.na(f5[,5]) & is.na(f5[,6]) & f5[,7]==5] ,na.rm=TRUE)
With this code:
sum(f5$CreditValue[is.na(f5[,3,4,5,6]) & f5[,7]==5] ,na.rm=TRUE)
But it doesn't work, because with the original conditions I am considering only specific rows in each data frame and the shortened code doesn't do this. I would like to save the results of the sums in a new data frame or matrix like this:
f1 f2 f3 f4 f5
f1 45 0 0 0 0
f2 36 30 0 0 0
f3 36 22 30 0 0
f4 97 3 0 12 0
f5 97 0 5 28 10
The zeros in the last data frame arise because not all data frames have all flag variables; for example, f1 only has b1 and doesn't have b2, b3, b4, b5 like f5. The dput version of my list is the following:
structure(list(f1 = structure(list(KeyID = c("001", "002", "003",
"004", "005", "006", "007", "009", "010"), CreditValue = c(1,
2, 3, 4, 5, 6, 7, 8, 9), b1 = c(1, 1, 1, 1, 1, 1, 1, 1, 1)), .Names = c("KeyID",
"CreditValue", "b1"), row.names = c(NA, 9L), class = "data.frame"),
f2 = structure(list(KeyID = c("001", "002", "003", "004",
"005", "006", "007", "009", "010", "011", "012"), CreditValue = c(1,
2, 3, 4, 5, 6, 7, 8, 9, 10, 11), b1 = c(1, 1, NA, NA, NA,
1, 1, NA, 1, NA, 1), b2 = c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2)), .Names = c("KeyID", "CreditValue", "b1", "b2"), row.names = c(NA,
11L), class = "data.frame"), f3 = structure(list(KeyID = c("001",
"002", "003", "004", "005", "006", "007", "009", "010", "011",
"012", "013", "014"), CreditValue = c(1, 2, 3, 4, 5, 6, 7,
8, 9, 10, 11, 11, 11), b1 = c(1, 1, 1, 1, NA, NA, 1, 1, NA,
NA, NA, 1, NA), b2 = c(2, 2, 2, 2, 2, 2, 2, 2, NA, NA, 2,
2, NA), b3 = c(3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3)), .Names = c("KeyID",
"CreditValue", "b1", "b2", "b3"), row.names = c(NA, 13L), class = "data.frame"),
f4 = structure(list(KeyID = c("001", "002", "003", "004",
"005", "006", "007", "009", "010", "011", "012", "013", "014",
"015", "016"), CreditValue = c(1, 2, 3, 4, 5, 6, 7, 8, 9,
10, 11, 11, 11, 12, 12), b1 = c(NA, NA, NA, NA, NA, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1), b2 = c(2, 2, NA, NA, NA, 2, 2, 2,
2, 2, 2, 2, 2, NA, NA), b3 = c(3, 3, NA, NA, NA, 3, 3, 3,
3, 3, 3, 3, 3, 3, 3), b4 = c(4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
4, 4, 4, 4, 4)), .Names = c("KeyID", "CreditValue", "b1",
"b2", "b3", "b4"), row.names = c(NA, 15L), class = "data.frame"),
f5 = structure(list(KeyID = c("001", "002", "003", "004",
"005", "006", "007", "009", "010", "011", "012", "013", "014",
"015", "016", "017", "018"), CreditValue = c(1, 2, 3, 4,
5, 6, 7, 8, 9, 10, 11, 11, 11, 12, 12, 14, 14), b1 = c(1,
1, 1, 1, NA, 1, 1, 1, 1, NA, 1, 1, 1, 1, 1, NA, NA), b2 = c(2,
2, 2, 2, NA, 2, 2, 2, 2, NA, 2, 2, 2, 2, 2, NA, NA), b3 = c(3,
3, 3, 3, 3, 3, 3, 3, 3, NA, 3, 3, 3, 3, 3, NA, NA), b4 = c(4,
4, 4, 4, 4, 4, 4, 4, 4, NA, 4, 4, 4, 4, 4, 4, 4), b5 = c(5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5)), .Names = c("KeyID",
"CreditValue", "b1", "b2", "b3", "b4", "b5"), row.names = c(NA,
17L), class = "data.frame")), .Names = c("f1", "f2", "f3",
"f4", "f5"))
I hope you can help me; it is quite complex for me to build a function to compute the sums, and if I use traditional forms of code I would have problems with lists of more data frames. Thanks for your help.
You can use lapply and call a function that builds the rows of your output data frame:
#' Sum CreditValue per flag level for one data frame.
#'
#' For flag level `y` (column named `b<y>`, read from position `y + 2`), sums
#' `CreditValue` over the rows where every earlier flag column (positions 3 to
#' `y + 1`) is NA and the flag column equals `y`. A level whose `b<y>` column
#' is absent from `df` contributes 0.
#'
#' @param df Data frame with a `CreditValue` column and flag columns starting
#'   at column 3.
#' @param n.flags Number of flag levels to evaluate. Defaults to 5, the value
#'   that was previously hard-coded.
#' @return Numeric vector of length `n.flags`, one sum per flag level.
get.sums <- function(df, n.flags = 5L) {
  vapply(seq_len(n.flags), function(y) {
    if (!paste0("b", y) %in% names(df)) {
      return(0)
    }
    # Rows in which every flag before b<y> is missing; for the first flag
    # there is nothing to check, so every row qualifies.
    earlier.missing <- if (y > 1L) {
      rowSums(!is.na(df[, 3:(y + 1L), drop = FALSE])) == 0
    } else {
      rep(TRUE, nrow(df))
    }
    # NA in the flag column yields an NA index; na.rm = TRUE drops those rows.
    keep <- earlier.missing & df[, y + 2L] == y
    as.numeric(sum(df$CreditValue[keep], na.rm = TRUE))
  }, numeric(1))
}
# One vector of sums per data frame in `global`, stacked into a matrix with
# one row per data frame (rows are named after the list elements).
rows <- lapply(X = global, FUN = get.sums)
sums <- do.call(what = rbind, args = rows)
sums
# [,1] [,2] [,3] [,4] [,5]
# f1 45 0 0 0 0
# f2 36 30 0 0 0
# f3 36 22 30 0 0
# f4 97 3 0 12 0
# f5 97 0 5 28 10