I got a nested list l with each item each self is a 2 level list. For example:
l1 = list("a", list("a1"= "a1v"))
l2 = list("b", list("b1" = "b1v", b2 = "b2v"))
l3 = list("c", list("c1" = c("c1v1", "c1v2", "c1v3")))
l = list(l1, l2, l3)
How do I tranform it to a data.frame like this:
df = data.frame(A = c("a", "b", "b", "c", "c", "c"), B= c("a1", "b1", "b2", "c1", "c1", "c1"), C=c("a1v", "b1v", "b2v", "c1v1", "c1v2", "c1v3"))
> df
A B C
1 a a1 a1v
2 b b1 b1v
3 b b2 b2v
4 c c1 c1v1
5 c c1 c1v2
6 c c1 c1v3
Tried with seperate_rows and map_df but both failed to deal with inconsistent dimension of .x[[2]] items.
Update 1:
#akrun's solution is not running for me:
We could use bind_rows with map
library(purrr)
library(dplyr)
library(tidyr)
map_dfr(l, ~bind_cols(.x) %>%
pivot_longer(cols = -1, names_to = 'B', values_to = 'C') %>%
rename_at(1, ~'A'))
# A tibble: 6 x 3
# A B C
#* <chr> <chr> <chr>
#1 a a1 a1v
#2 b b1 b1v
#3 b b2 b2v
#4 c c1 c1v1
#5 c c1 c1v2
#6 c c1 c1v3
If the sample data in your question accurately reflects your actual data, you can try one of the following:
library(data.table)
data.table(l)[, list(names(unlist(l)),
unlist(l, use.names = FALSE))][
, V3 := V2[1], cumsum(V1 == "")][V1 != ""]
## V1 V2 V3
## 1: a1 a1v a
## 2: b1 b1v b
## 3: b2 b2v b
## 4: c11 c1v1 c
## 5: c12 c1v2 c
## 6: c13 c1v3 c
reshape2::melt(setNames(lapply(l, "[[", -1), lapply(l, "[[", 1)))
## value L2 L1
## 1 a1v a1 a
## 2 b1v b1 b
## 3 b2v b2 b
## 4 c1v1 c1 c
## 5 c1v2 c1 c
## 6 c1v3 c1 c
Base R option :
do.call(rbind, lapply(l, function(x) {
data.frame(A = x[[1]], B = unlist(x[[2]]), C = names(x[[2]]))
}))
# A B C
#a1 a a1v a1
#b1 b b1v b1
#b2 b b2v b2
#c11 c c1v1 c1
#c12 c c1v2 c1
#c13 c c1v3 c1
Since this is also one of the solution, I will post it here as well. This one is the one I can relate to.
map_df(l, ~ tibble(A=.x[[1]], B=names(.x[[2]]), C= unlist(.x[[2]])))
Read:
Run through all elements of l and make a data.frame (map_df and ~ inside) from a sub-data.frame created by tibble where column A = ..., B = ..`, ...
Thanks go to:
#akrun for prompt answer, I could have used the solution, but was
too busy to figure out.
#A5C1D2H2I1M1N2O1R2T1 also provided a
performant answer.
#Ronak Shah provided a plain R base
solution that I can translate to this.
Related
I have example data as follows:
library(data.table)
dat <- fread("Survey Variable_codes_2022
D D1
A A1
B B1
B B3
B B2
E E1
B NA
E NA")
For the two rows that have Variable_codes_2022==NA, I would like to increment the variable code so that it becomes:
dat <- fread("Survey Variable_codes_2022
D D1
A A1
B B1
B B3
B B2
E E1
B B4
E E2"
Because the column Variable_codes_2022 is a string variable, the numbers are not in numerical order.
I have no idea where to start and I was wondering if someone could help me on the right track.
We could do it this way:
grouping
arranging and
mutate.
To keep the original order we could first create and id and then rearrange:
library(dplyr)
dat %>%
group_by(Survey) %>%
arrange(.by_group = TRUE) %>%
mutate(Variable_codes_2022 = paste0(Survey, row_number()))
Survey Variable_codes_2022
<chr> <chr>
1 A A1
2 B B1
3 B B2
4 B B3
5 B B4
6 D D1
7 E E1
8 E E2
data.table option using rleid like this:
library(data.table)
dat[, Variable_codes_2022 := paste0(Survey, rleid(Variable_codes_2022)), by = Survey]
dat
#> Survey Variable_codes_2022
#> 1: D D1
#> 2: A A1
#> 3: B B1
#> 4: B B2
#> 5: B B3
#> 6: E E1
#> 7: B B4
#> 8: E E2
Created on 2022-12-01 with reprex v2.0.2
dat <-
structure(list(survey = c("D", "A", "B", "B", "B", "E", "B",
"E", "B"), var_code = c("D1", "A1", "B1", "B3", "B2", "E1", NA,
NA, NA)), row.names = c(NA, -9L), class = c("data.table", "data.frame"
), .internal.selfref = <pointer: 0x0000026db10f1ef0>)
library(dplyr)
library(stringr)
dat %>%
group_by(survey) %>%
mutate(
aux1 = as.numeric(stringr::str_remove(var_code,survey)),
aux2 = cumsum(is.na(var_code)),
var_code = paste0(survey,max(aux1,na.rm = TRUE)+aux2)
) %>%
ungroup() %>%
select(-aux1,-aux2)
# A tibble: 9 x 2
survey var_code
<chr> <chr>
1 D D1
2 A A1
3 B B3
4 B B3
5 B B3
6 E E1
7 B B4
8 E E2
9 B B5
This solution with rowid.
Added an extra element to the sample so it can be tested against multiple missings
library(data.table)
#> Warning: package 'data.table' was built under R version 4.2.2
dat <- fread("Survey Variable_codes_2022
D D1
A A1
B B1
B B3
B B2
E E1
B NA
E NA
E NA")
dat[, n := as.numeric(substr(
Variable_codes_2022, nchar(Survey)+1, nchar(Variable_codes_2022)))]
dat[is.na(n),
Variable_codes_2022 := paste0(Survey, rowid(Survey) +
dat[.SD[,.(Survey)], .(m=max(n, na.rm=T)), on = "Survey", by=.EACHI ][,m])]
dat
#> Survey Variable_codes_2022 n
#> 1: D D1 1
#> 2: A A1 1
#> 3: B B1 1
#> 4: B B3 3
#> 5: B B2 2
#> 6: E E1 1
#> 7: B B4 NA
#> 8: E E2 NA
#> 9: E E3 NA
I have two dataframes that I want to combine, for each possible combination.
Basically, I dataframes like this:
> table1 = data.frame(a1 = c("a","b"), a2 = c("c", "d"))
> table1
a1 a2
1 a c
2 b d
> table2 = data.frame(b1 = c("e", "f"), b2 = c("g", "h"))
> table2
b1 b2
1 e g
2 f h
and I want to get a result like this:
> combinedtable = data.frame(a1 = c("a","a", "b","b"), a2 = c("c", "c", "d", "d"), b1 = c("e", "f","e", "f"), b2 = c("g", "h","g", "h"))
> combinedtable
a1 a2 b1 b2
1 a c e g
2 a c f h
3 b d e g
4 b d f h
Is there a neat way to do this? What I eventually want to do is to run an lapply on the resulting table. Otherwise I need to write a function like:
for each row in X, apply this function for each row in Y.
Combining first seems more efficient.
base R
with(expand.grid(a=seq_len(nrow(table1)), b=seq_len(nrow(table2))),
cbind(table1[a,], table2[b,]))
# a1 a2 b1 b2
# 1 a c e g
# 2 b d e g
# 1.1 a c f h
# 2.1 b d f h
or
merge(table1, table2, by = NULL)
# a1 a2 b1 b2
# 1 a c e g
# 2 b d e g
# 3 a c f h
# 4 b d f h
dplyr
Similar to the by=NULL method, we can do
dplyr::full_join(table1, table2, by = character())
Here is my approach with purrr:
purrr::pmap_dfr(table1, ~ data.frame(..., table2))
Returning:
a1 a2 b1 b2
1 a c e g
2 a c f h
3 b d e g
4 b d f h
The other answers have it for each combination of a1 and a2. I read this question a little differently so just in case: if you want a table with every combination of all levels of all columns:
cbind(table1, table2) %>%
complete(a1, a2, nesting(b1, b2))
I am still relatively new to working in R and I am not sure how to approach this problem. Any help or advice is greatly appreciated!!!
The problem I have is that I am working with two data frames and I need to recode the first data frame with values from the second. The first data frame (df1) contains the data from the respondents to a survey and the other data frame(df2) is the data dictionary for df1.
The data looks like this:
df1 <- data.frame(a = c(1,2,3),
b = c(4,5,6),
c = c(7,8,9))
df2 <- data.frame(columnIndicator = c("a","a","a","b","b","b","c","c","c" ),
df1_value = c(1,2,3,4,5,6,7,8,9),
new_value = c("a1","a2","a3","b1","b2","b3","c1","c2","c3"))
So far I can manually recode df1 to get the expected output by doing this:
df1 <- within(df1,{
a[a==1] <- "a1"
a[a==2] <- "a2"
a[a==3] <- "a3"
b[b==4] <- "b4"
b[b==5] <- "b5"
b[b==6] <- "b6"
c[c==7] <- "c7"
c[c==8] <- "c8"
c[c==9] <- "c9"
})
However my real dataset has about 42 columns that need to be recoded and that method is a little time intensive. Is there another way in R for me to recode the values in df1 with the values in df2?
Thanks!
Just need to transform the shape a bit.
library(data.table)
df1 <- data.frame(a = c(1,2,3),
b = c(4,5,6),
c = c(7,8,9))
df2 <- data.frame(columnIndicator = c("a","a","a","b","b","b","c","c","c" ),
df1_value = c(1,2,3,4,5,6,7,8,9),
new_value = c("a1","a2","a3","b4","b5","b6","c7","c8","c9"),stringsAsFactors = FALSE)
setDT(df1)
setDT(df2)
df1[,ID:=.I]
ldf1 <- melt(df1,measure.vars = c("a","b","c"),variable.name = "columnIndicator",value.name = "df1_value")
ldf1[df2,"new_value":=i.new_value,on=.(columnIndicator,df1_value)]
ldf1
#> ID columnIndicator df1_value new_value
#> 1: 1 a 1 a1
#> 2: 2 a 2 a2
#> 3: 3 a 3 a3
#> 4: 1 b 4 b4
#> 5: 2 b 5 b5
#> 6: 3 b 6 b6
#> 7: 1 c 7 c7
#> 8: 2 c 8 c8
#> 9: 3 c 9 c9
dcast(ldf1,ID~columnIndicator,value.var = "new_value")
#> ID a b c
#> 1: 1 a1 b4 c7
#> 2: 2 a2 b5 c8
#> 3: 3 a3 b6 c9
Created on 2020-04-18 by the reprex package (v0.3.0)
In base R, we can unlist df1 match it with df1_value and get corresponding new_value.
df1[] <- df2$new_value[match(unlist(df1), df2$df1_value)]
df1
# a b c
#1 a1 b1 c1
#2 a2 b2 c2
#3 a3 b3 c3
Is this what you are looking for???
library(dplyr)
df3 <- df1 %>% gather(key = "key", value = "value")
df3 %>% inner_join(df2, by = c("key" = "columnIndicator", "value" = "df1_value"))
Output
key value new_value
1 a 1 a1
2 a 2 a2
3 a 3 a3
4 b 4 b1
5 b 5 b2
6 b 6 b3
7 c 7 c1
8 c 8 c2
9 c 9 c3
I have df as follow
df
ID type other-col
1 A1 cc
1 A2 dd
1 A3 cc
2 A1 cc
2 B1 aa
3 A2 aa
I want add new to when "ID" changes with the value of F for "type" and "other-col" columns
new_df
ID
df
ID type other-col
1 A1 cc
1 A2 dd
1 A3 cc
1 F F <- this row added
2 A1 cc
2 B1 aa
2 F F <- this row added
3 A2 aa
how can I do it in R?
thx
This should be doable in a single replacement operation once you know the indexes of where each change occurs. E.g.:
idx <- match(unique(df$ID), df$ID)[-1] - 1
df <- df[sort(c(sequence(nrow(df)),idx)),]
df[seq_along(idx) + idx, c("type","other_col")] <- "F"
# ID type other_col
#1 1 A1 cc
#2 1 A2 dd
#3 1 A3 cc
#3.1 1 F F
#4 2 A1 cc
#5 2 B1 aa
#5.1 2 F F
#6 3 A2 aa
Where df was:
df <- read.table(text="ID type other_col
1 A1 cc
1 A2 dd
1 A3 cc
2 A1 cc
2 B1 aa
3 A2 aa", header=TRUE, stringsAsFactors=FALSE)
An option with group_split and add_row. We can split by 'ID' with group_split into a list of data.frames, then loop through the list with map, add a row as the last row (add_row - by default adds row to the end, but we can control it with .before and .after), then slice out the last row as the last 'ID' didn't need the 'F' row
library(tidyverse)
df1 %>%
group_split(ID) %>%
map_dfr(~ .x %>%
add_row(ID = first(.$ID), type = 'F', `other-col` = 'F')) %>%
slice(-n())
Here is another approach with a similar idea as #akrun's answer.
library(tidyverse)
dat2 <- dat %>%
split(f = .$ID) %>%
map_if(.p = function(x) unique(x$ID) < max(dat$ID),
~bind_rows(.x, tibble(ID = unique(.x$ID), type = "F", `other.col` = "F"))) %>%
bind_rows()
dat2
# ID type other.col
# 1 1 A1 cc
# 2 1 A2 dd
# 3 1 A3 cc
# 4 1 F F
# 5 2 A1 cc
# 6 2 B1 aa
# 7 2 F F
# 8 3 A2 aa
Data
dat <- read.table(text = "ID type other-col
1 A1 cc
1 A2 dd
1 A3 cc
2 A1 cc
2 B1 aa
3 A2 aa",
header = TRUE, stringsAsFactors = FALSE)
Update
I provided an updated answer to show that if ID column is not integer but character, we can create a new column (ID2 in this case) that is converted to be factor based on ID, and then convert it to integer. The rest of the operation would be similar to the original answer but based on ID2.
library(tidyverse)
dat2 <- dat %>%
mutate(ID2 = as.integer(factor(ID, levels = unique(.$ID)))) %>%
split(f = .$ID2) %>%
map_if(.p = function(x) unique(x$ID2) != unique(last(.)$ID2),
~bind_rows(.x, tibble(ID = unique(.x$ID), type = "F", `other.col` = "F",
ID2 = unique(.x$ID2)))) %>%
bind_rows() %>%
select(-ID2)
dat2
# ID type other.col
# 1 C A1 cc
# 2 C A2 dd
# 3 C A3 cc
# 4 C F F
# 5 A A1 cc
# 6 A B1 aa
# 7 A F F
# 8 B A2 aa
DATA
dat <- read.table(text = "ID type other-col
C A1 cc
C A2 dd
C A3 cc
A A1 cc
A B1 aa
B A2 aa",
header = TRUE, stringsAsFactors = FALSE)
Similar to akrun's answer but in base R. Basically, split dataframe by ID then rbind extra row to each split, then recombine dataframe and remove unrequired last row using head(..., -1) -
head(n = -1,
do.call(rbind,
lapply(split(dat, dat$ID), function(x) {
rbind(x, c(x$ID[1], "F", "F"))
})
)
)
ID type other.col
1.1 1 A1 cc
1.2 1 A2 dd
1.3 1 A3 cc
1.4 1 F F
2.4 2 A1 cc
2.5 2 B1 aa
2.3 2 F F
3.6 3 A2 aa
Using base R you could do:
cbind(ID=sort(c(dat$ID,unique(dat$ID))),do.call(rbind,by(dat[-1],dat[1],rbind,'F')))
ID type other.col
1.1 1 A1 cc
1.2 1 A2 dd
1.3 1 A3 cc
1.4 1 F F
2.4 2 A1 cc
2.5 2 B1 aa
2.3 2 F F
3.6 3 A2 aa
3.2 3 F F
Or you could do:
do.call(rbind,by(dat,dat$ID,function(x)cbind(ID = unique(x[,1]),rbind(x[-1],"F"))))
inds = head(cumsum(with(rle(df$ID), unlist(lapply(lengths, function(i) c((rep(1, i)), F = 0))))), -1)
df1 = df[inds,]
df1[which(names(inds) == "F"), c("type", "other_col")] = "F"
df1
# ID type other_col
#1 1 A1 cc
#2 1 A2 dd
#3 1 A3 cc
#3.1 1 F F
#4 2 A1 cc
#5 2 B1 aa
#5.1 2 F F
#6 3 A2 aa
A possible approach using data.table:
library(data.table)
m <- setDT(df)[, max(ID)]
df[, if (.BY$ID < m) rbind(.SD, as.list(rep("F", ncol(.SD)))) else .SD, ID]
output:
ID type other-col
1: 1 A1 cc
2: 1 A2 dd
3: 1 A3 cc
4: 1 F F
5: 2 A1 cc
6: 2 B1 aa
7: 2 F F
8: 3 A2 aa
or if you dont mind adding another row at the bottom, code will be shorter: setDT(df)[, rbind(.SD, as.list(rep("F", ncol(.SD)))), ID]
I have a dataframe:
df <- data.frame(id = c('1','2','3'), b = c('b1', '', 'b3'), c = c('c1', 'c2', ''), d = c('d1', '', ''))
id b c d
1 b1 c1 d1
2 c2
3 b3
where the row with id-1 is filled with all data with no empty column values. I want to copy all cell values from id-1 into id 2 and 3 if there are missing values in those cells from rows 2 & 3. Final output something like:
df2 <- data.frame(id = c('1','2','3'), b = c('b1', 'b1', 'b3'), c = c('c1', 'c2', 'c1'), d = c('d1', 'd1', 'd1'))
id b c d
1 b1 c1 d1
2 b1 c2 d1
3 b3 c1 d1
Thank you for your help in advance
Use some matrix indexing to get the "" cases and then overwrite selecting the appropriate column from the first row of df:
idx <- which(df[-1]=="", arr.ind=TRUE)
df[-1][idx] <- unlist(df[1,-1][idx[,"col"]])
# id b c d
#1 1 b1 c1 d1
#2 2 b1 c2 d1
#3 3 b3 c1 d1