Copy rows of dataframes in R - r

I have a dataframe:
df <- data.frame(id = c('1','2','3'), b = c('b1', '', 'b3'), c = c('c1', 'c2', ''), d = c('d1', '', ''))
id b c d
1 b1 c1 d1
2 c2
3 b3
where the row with id-1 is filled with all data with no empty column values. I want to copy all cell values from id-1 into id 2 and 3 if there are missing values in those cells from rows 2 & 3. Final output something like:
df2 <- data.frame(id = c('1','2','3'), b = c('b1', 'b1', 'b3'), c = c('c1', 'c2', 'c1'), d = c('d1', 'd1', 'd1'))
id b c d
1 b1 c1 d1
2 b1 c2 d1
3 b3 c1 d1
Thank you for your help in advance

Use some matrix indexing to get the "" cases and then overwrite selecting the appropriate column from the first row of df:
idx <- which(df[-1]=="", arr.ind=TRUE)
df[-1][idx] <- unlist(df[1,-1][idx[,"col"]])
# id b c d
#1 1 b1 c1 d1
#2 2 b1 c2 d1
#3 3 b3 c1 d1

Related

Convert nested lists to data.frame using all recursive indexes as colnames and fill missing columns with NAs

I have been struggling with this for a day now and all research made in SO doesn't seem to produce the result that i need.
I have this list:
input_list <- list(
list(A = 'a1', B = 'b1', C = 'c1', D = 'd1'),
list(A = 'a2', C = 'c2', D = 'd2'),
list(A = 'a3', B = 'b3', C = 'c3'),
list(A = 'a4', B = 'b4', C = 'c4',
D = list(
sub_1 = "d4_1",
sub_2 = "d4_2")
)
)
Basically I want to turn it into this structure:
#tbl_df
#A B C D D.sub_1 D_sub_2
#a1 b1 c1 d1 NA NA
#a2 NA c2 d2 NA NA
#a3 b3 c3 NA NA NA
#a4 b4 c4 NA d4_1 d4_2
I tried messing with map function:
output_list <- input_list %>%
map(unlist) %>%
do.call(rbind.data.frame, .)
It correctly unlists all nested lists converting them to named vectors, but I'm stuck as to how to rbind the rows matching the column names and fill missing variables with NAs.
Any help appreciated.
Maybe try this. You can use unlist() with lapply() to unnest the values and then transform to dataframe each element using as.data.frame(t(...)). Finally, bind_rows() from dplyr can bind the elements as you expect. Here the code:
library(dplyr)
#Code
newdf <- bind_rows(lapply(input_list, function(x) as.data.frame(t(unlist(x)))))
Output:
A B C D D.sub_1 D.sub_2
1 a1 b1 c1 d1 <NA> <NA>
2 a2 <NA> c2 d2 <NA> <NA>
3 a3 b3 c3 <NA> <NA> <NA>
4 a4 b4 c4 <NA> d4_1 d4_2
You can use map_dfr :
purrr::map_dfr(input_list, as.data.frame)
# A B C D D.sub_1 D.sub_2
#1 a1 b1 c1 d1 <NA> <NA>
#2 a2 <NA> c2 d2 <NA> <NA>
#3 a3 b3 c3 <NA> <NA> <NA>
#4 a4 b4 c4 <NA> d4_1 d4_2
We can use unnest_wider
library(purrr)
library(dplyr)
tibble(col = input_list) %>%
unnest_wider(c(col)) %>%
unnest_wider(c(D))

Convert nested list to data.frame

I got a nested list l with each item each self is a 2 level list. For example:
l1 = list("a", list("a1"= "a1v"))
l2 = list("b", list("b1" = "b1v", b2 = "b2v"))
l3 = list("c", list("c1" = c("c1v1", "c1v2", "c1v3")))
l = list(l1, l2, l3)
How do I tranform it to a data.frame like this:
df = data.frame(A = c("a", "b", "b", "c", "c", "c"), B= c("a1", "b1", "b2", "c1", "c1", "c1"), C=c("a1v", "b1v", "b2v", "c1v1", "c1v2", "c1v3"))
> df
A B C
1 a a1 a1v
2 b b1 b1v
3 b b2 b2v
4 c c1 c1v1
5 c c1 c1v2
6 c c1 c1v3
Tried with seperate_rows and map_df but both failed to deal with inconsistent dimension of .x[[2]] items.
Update 1:
#akrun's solution is not running for me:
We could use bind_rows with map
library(purrr)
library(dplyr)
library(tidyr)
map_dfr(l, ~bind_cols(.x) %>%
pivot_longer(cols = -1, names_to = 'B', values_to = 'C') %>%
rename_at(1, ~'A'))
# A tibble: 6 x 3
# A B C
#* <chr> <chr> <chr>
#1 a a1 a1v
#2 b b1 b1v
#3 b b2 b2v
#4 c c1 c1v1
#5 c c1 c1v2
#6 c c1 c1v3
If the sample data in your question accurately reflects your actual data, you can try one of the following:
library(data.table)
data.table(l)[, list(names(unlist(l)),
unlist(l, use.names = FALSE))][
, V3 := V2[1], cumsum(V1 == "")][V1 != ""]
## V1 V2 V3
## 1: a1 a1v a
## 2: b1 b1v b
## 3: b2 b2v b
## 4: c11 c1v1 c
## 5: c12 c1v2 c
## 6: c13 c1v3 c
reshape2::melt(setNames(lapply(l, "[[", -1), lapply(l, "[[", 1)))
## value L2 L1
## 1 a1v a1 a
## 2 b1v b1 b
## 3 b2v b2 b
## 4 c1v1 c1 c
## 5 c1v2 c1 c
## 6 c1v3 c1 c
Base R option :
do.call(rbind, lapply(l, function(x) {
data.frame(A = x[[1]], B = unlist(x[[2]]), C = names(x[[2]]))
}))
# A B C
#a1 a a1v a1
#b1 b b1v b1
#b2 b b2v b2
#c11 c c1v1 c1
#c12 c c1v2 c1
#c13 c c1v3 c1
Since this is also one of the solution, I will post it here as well. This one is the one I can relate to.
map_df(l, ~ tibble(A=.x[[1]], B=names(.x[[2]]), C= unlist(.x[[2]])))
Read:
Run through all elements of l and make a data.frame (map_df and ~ inside) from a sub-data.frame created by tibble where column A = ..., B = ..`, ...
Thanks go to:
#akrun for prompt answer, I could have used the solution, but was
too busy to figure out.
#A5C1D2H2I1M1N2O1R2T1 also provided a
performant answer.
#Ronak Shah provided a plain R base
solution that I can translate to this.

In R is there a way to recode the columns from one data frame with values from another data frame?

I am still relatively new to working in R and I am not sure how to approach this problem. Any help or advice is greatly appreciated!!!
The problem I have is that I am working with two data frames and I need to recode the first data frame with values from the second. The first data frame (df1) contains the data from the respondents to a survey and the other data frame(df2) is the data dictionary for df1.
The data looks like this:
df1 <- data.frame(a = c(1,2,3),
b = c(4,5,6),
c = c(7,8,9))
df2 <- data.frame(columnIndicator = c("a","a","a","b","b","b","c","c","c" ),
df1_value = c(1,2,3,4,5,6,7,8,9),
new_value = c("a1","a2","a3","b1","b2","b3","c1","c2","c3"))
So far I can manually recode df1 to get the expected output by doing this:
df1 <- within(df1,{
a[a==1] <- "a1"
a[a==2] <- "a2"
a[a==3] <- "a3"
b[b==4] <- "b4"
b[b==5] <- "b5"
b[b==6] <- "b6"
c[c==7] <- "c7"
c[c==8] <- "c8"
c[c==9] <- "c9"
})
However my real dataset has about 42 columns that need to be recoded and that method is a little time intensive. Is there another way in R for me to recode the values in df1 with the values in df2?
Thanks!
Just need to transform the shape a bit.
library(data.table)
df1 <- data.frame(a = c(1,2,3),
b = c(4,5,6),
c = c(7,8,9))
df2 <- data.frame(columnIndicator = c("a","a","a","b","b","b","c","c","c" ),
df1_value = c(1,2,3,4,5,6,7,8,9),
new_value = c("a1","a2","a3","b4","b5","b6","c7","c8","c9"),stringsAsFactors = FALSE)
setDT(df1)
setDT(df2)
df1[,ID:=.I]
ldf1 <- melt(df1,measure.vars = c("a","b","c"),variable.name = "columnIndicator",value.name = "df1_value")
ldf1[df2,"new_value":=i.new_value,on=.(columnIndicator,df1_value)]
ldf1
#> ID columnIndicator df1_value new_value
#> 1: 1 a 1 a1
#> 2: 2 a 2 a2
#> 3: 3 a 3 a3
#> 4: 1 b 4 b4
#> 5: 2 b 5 b5
#> 6: 3 b 6 b6
#> 7: 1 c 7 c7
#> 8: 2 c 8 c8
#> 9: 3 c 9 c9
dcast(ldf1,ID~columnIndicator,value.var = "new_value")
#> ID a b c
#> 1: 1 a1 b4 c7
#> 2: 2 a2 b5 c8
#> 3: 3 a3 b6 c9
Created on 2020-04-18 by the reprex package (v0.3.0)
In base R, we can unlist df1 match it with df1_value and get corresponding new_value.
df1[] <- df2$new_value[match(unlist(df1), df2$df1_value)]
df1
# a b c
#1 a1 b1 c1
#2 a2 b2 c2
#3 a3 b3 c3
Is this what you are looking for???
library(dplyr)
df3 <- df1 %>% gather(key = "key", value = "value")
df3 %>% inner_join(df2, by = c("key" = "columnIndicator", "value" = "df1_value"))
Output
key value new_value
1 a 1 a1
2 a 2 a2
3 a 3 a3
4 b 4 b1
5 b 5 b2
6 b 6 b3
7 c 7 c1
8 c 8 c2
9 c 9 c3

add row based on variable condition in R

I have df as follow
df
ID type other-col
1 A1 cc
1 A2 dd
1 A3 cc
2 A1 cc
2 B1 aa
3 A2 aa
I want add new to when "ID" changes with the value of F for "type" and "other-col" columns
new_df
ID
df
ID type other-col
1 A1 cc
1 A2 dd
1 A3 cc
1 F F <- this row added
2 A1 cc
2 B1 aa
2 F F <- this row added
3 A2 aa
how can I do it in R?
thx
This should be doable in a single replacement operation once you know the indexes of where each change occurs. E.g.:
idx <- match(unique(df$ID), df$ID)[-1] - 1
df <- df[sort(c(sequence(nrow(df)),idx)),]
df[seq_along(idx) + idx, c("type","other_col")] <- "F"
# ID type other_col
#1 1 A1 cc
#2 1 A2 dd
#3 1 A3 cc
#3.1 1 F F
#4 2 A1 cc
#5 2 B1 aa
#5.1 2 F F
#6 3 A2 aa
Where df was:
df <- read.table(text="ID type other_col
1 A1 cc
1 A2 dd
1 A3 cc
2 A1 cc
2 B1 aa
3 A2 aa", header=TRUE, stringsAsFactors=FALSE)
An option with group_split and add_row. We can split by 'ID' with group_split into a list of data.frames, then loop through the list with map, add a row as the last row (add_row - by default adds row to the end, but we can control it with .before and .after), then slice out the last row as the last 'ID' didn't need the 'F' row
library(tidyverse)
df1 %>%
group_split(ID) %>%
map_dfr(~ .x %>%
add_row(ID = first(.$ID), type = 'F', `other-col` = 'F')) %>%
slice(-n())
Here is another approach with a similar idea as #akrun's answer.
library(tidyverse)
dat2 <- dat %>%
split(f = .$ID) %>%
map_if(.p = function(x) unique(x$ID) < max(dat$ID),
~bind_rows(.x, tibble(ID = unique(.x$ID), type = "F", `other.col` = "F"))) %>%
bind_rows()
dat2
# ID type other.col
# 1 1 A1 cc
# 2 1 A2 dd
# 3 1 A3 cc
# 4 1 F F
# 5 2 A1 cc
# 6 2 B1 aa
# 7 2 F F
# 8 3 A2 aa
Data
dat <- read.table(text = "ID type other-col
1 A1 cc
1 A2 dd
1 A3 cc
2 A1 cc
2 B1 aa
3 A2 aa",
header = TRUE, stringsAsFactors = FALSE)
Update
I provided an updated answer to show that if ID column is not integer but character, we can create a new column (ID2 in this case) that is converted to be factor based on ID, and then convert it to integer. The rest of the operation would be similar to the original answer but based on ID2.
library(tidyverse)
dat2 <- dat %>%
mutate(ID2 = as.integer(factor(ID, levels = unique(.$ID)))) %>%
split(f = .$ID2) %>%
map_if(.p = function(x) unique(x$ID2) != unique(last(.)$ID2),
~bind_rows(.x, tibble(ID = unique(.x$ID), type = "F", `other.col` = "F",
ID2 = unique(.x$ID2)))) %>%
bind_rows() %>%
select(-ID2)
dat2
# ID type other.col
# 1 C A1 cc
# 2 C A2 dd
# 3 C A3 cc
# 4 C F F
# 5 A A1 cc
# 6 A B1 aa
# 7 A F F
# 8 B A2 aa
DATA
dat <- read.table(text = "ID type other-col
C A1 cc
C A2 dd
C A3 cc
A A1 cc
A B1 aa
B A2 aa",
header = TRUE, stringsAsFactors = FALSE)
Similar to akrun's answer but in base R. Basically, split dataframe by ID then rbind extra row to each split, then recombine dataframe and remove unrequired last row using head(..., -1) -
head(n = -1,
do.call(rbind,
lapply(split(dat, dat$ID), function(x) {
rbind(x, c(x$ID[1], "F", "F"))
})
)
)
ID type other.col
1.1 1 A1 cc
1.2 1 A2 dd
1.3 1 A3 cc
1.4 1 F F
2.4 2 A1 cc
2.5 2 B1 aa
2.3 2 F F
3.6 3 A2 aa
Using base R you could do:
cbind(ID=sort(c(dat$ID,unique(dat$ID))),do.call(rbind,by(dat[-1],dat[1],rbind,'F')))
ID type other.col
1.1 1 A1 cc
1.2 1 A2 dd
1.3 1 A3 cc
1.4 1 F F
2.4 2 A1 cc
2.5 2 B1 aa
2.3 2 F F
3.6 3 A2 aa
3.2 3 F F
Or you could do:
do.call(rbind,by(dat,dat$ID,function(x)cbind(ID = unique(x[,1]),rbind(x[-1],"F"))))
inds = head(cumsum(with(rle(df$ID), unlist(lapply(lengths, function(i) c((rep(1, i)), F = 0))))), -1)
df1 = df[inds,]
df1[which(names(inds) == "F"), c("type", "other_col")] = "F"
df1
# ID type other_col
#1 1 A1 cc
#2 1 A2 dd
#3 1 A3 cc
#3.1 1 F F
#4 2 A1 cc
#5 2 B1 aa
#5.1 2 F F
#6 3 A2 aa
A possible approach using data.table:
library(data.table)
m <- setDT(df)[, max(ID)]
df[, if (.BY$ID < m) rbind(.SD, as.list(rep("F", ncol(.SD)))) else .SD, ID]
output:
ID type other-col
1: 1 A1 cc
2: 1 A2 dd
3: 1 A3 cc
4: 1 F F
5: 2 A1 cc
6: 2 B1 aa
7: 2 F F
8: 3 A2 aa
or if you dont mind adding another row at the bottom, code will be shorter: setDT(df)[, rbind(.SD, as.list(rep("F", ncol(.SD)))), ID]

access first row of group_by dataset

I have a dataframedf1 with columns a,b,c. I want to assign c=0 to the first row of the dataset returned by group_by(a,b). I tried something like
t <- df1 %>% group_by(a,b) %>% filter(row_number(a)==1) %>% mutate(c= 0)
But it reduced number of rows. Expected output is
a b c
a1 b1 0
a1 b1 NA
a2 b2 0
a2 b2 NA
You can use seq_along to number elements in each group from 1 to the total number of elements within each group (2, in this case). Then use ifelse to set the first element of 'c' for each group to be 0 and leave the other element as is.
library(dplyr)
df %>%
group_by(a, b) %>%
mutate(c = ifelse(seq_along(c) == 1, 0, c))
# A tibble: 4 x 3
# Groups: a, b [2]
# a b c
# <fct> <fct> <dbl>
#1 a1 b1 0.
#2 a1 b1 NA
#3 a2 b2 0.
#4 a2 b2 NA
data
df <- data.frame(a = rep(c("a1", "a2"), each = 2),
b = rep(c("b1", "b2"), each = 2),
c = NA)
df
# a b c
#1 a1 b1 NA
#2 a1 b1 NA
#3 a2 b2 NA
#4 a2 b2 NA

Resources