There is a data frame with multiple values in one column.
I want to change the rows and columns of this data frame.
like this..
data:
result:
What should I do?
Please try this
wide_df <- df %>% group_by(type) %>%
mutate(row= row_number()) %>%
pivot_wider(names_from = type,values_from = value) %>%
select(-row)
Edit I need to check your output in your case, suppose when two rows 1, D, D2 and 2, A, A1 are added in input
Old answer Actually you want to distribute every available value of cols A to D for each id. So I used a temporary code to calculate the number of rows that have to be generated for each id by tmp code first. Thereafter I gathered values by pivoting wider the data and lastly replicating the list to desired number of times.
Follow it like this..
#load libraries
library(dplyr)
library(tidyr)
#calculate number of rows to generate
tmp <- df %>% group_by(id, type) %>%
mutate(tmp = n()) %>%
summarise(tmp = max(tmp)) %>%
group_by(id) %>%
summarise(tmp = prod(tmp))
#store this value in variable n
n <- tmp$tmp
#final code
df %>% pivot_wider(names_from = type, values_from = value,
values_fn = function(x){
l <- list(x)
list(rep(l[[1]], n/length(l[[1]])))
}) %>%
unnest(-id)
# A tibble: 6 x 5
id A B C D
<int> <chr> <chr> <chr> <chr>
1 1 A1 B1 C1 D1
2 1 A1 B2 C2 D1
3 1 A1 B1 C3 D1
4 1 A1 B2 C1 D1
5 1 A1 B1 C2 D1
6 1 A1 B2 C3 D1
dput used
df <- structure(list(id = c(1L, 1L, 1L, 1L, 1L, 1L, 1L), type = c("A",
"B", "B", "C", "C", "C", "D"), value = c("A1", "B1", "B2", "C1",
"C2", "C3", "D1")), class = "data.frame", row.names = c(NA, -7L
))
I think you can do this with unstack and expand.grid in case order does not matter and id is not used:
expand.grid(unstack(x[3:2]))
# A B C D
#1 A1 B1 C1 D1
#2 A1 B2 C1 D1
#3 A1 B1 C2 D1
#4 A1 B2 C2 D1
#5 A1 B1 C3 D1
#6 A1 B2 C3 D1
Data:
x <- data.frame(id = 1, type = c("A", "B", "B", "C", "C", "C", "D")
, value = c("A1", "B1", "B2", "C1", "C2", "C3", "D1"))
Sadly, the posted answers didn't get the right results.
So I got the results my way, But I don't know if this is an efficient way.
df <- data.frame(id = 1, type = c("A", "B", "B", "C", "C", "C", "D")
, val = c("A1", "B1", "B2", "C1", "C2", "C3", "D1"))
tmp <- aggregate(val~id+type, df, toString)
result_df <- tmp %>% spread(key = "type", value = "val") %>%
separate_rows(A, sep=",") %>% separate_rows(B, sep=",") %>% separate_rows(C, sep=",") %>% separate_rows(D, sep=",")
result:
Anyway, thank you all so much!
Related
This question already has answers here:
Add count of unique / distinct values by group to the original data
(3 answers)
Closed 7 months ago.
I have the following situation:
V1
V2
A
A1
A
A1
A
A1
A
A2
A
A2
A
A3
B
B1
B
B2
B
B2
and i need to group by V1, and summarise counting how many distinct groups each V1 level has in V2. Something like this:
V1
n
A
3
B
2
How can i use dplyr funcitons to solve that?
Thanks!!
We can use rle after grouping by 'V1'
library(dplyr)
df1 %>%
group_by(V1) %>%
summarise(n = length(rle(V2)$values), .groups = 'drop')
-output
# A tibble: 2 × 2
V1 n
<chr> <int>
1 A 3
2 B 2
Or with rleid and n_distinct
library(data.table)
df1 %>%
group_by(V1) %>%
summarise(n = n_distinct(rleid(V2)))
# A tibble: 2 × 2
V1 n
<chr> <int>
1 A 3
2 B 2
data
df1 <- structure(list(V1 = c("A", "A", "A", "A", "A", "A", "B", "B",
"B"), V2 = c("A1", "A1", "A1", "A2", "A2", "A1", "B1", "B2",
"B2")), class = "data.frame", row.names = c(NA, -9L))
I'm trying to do multiple merges/joins onto different columns in the same dataset, but when I do so the output is entirely wrong.
df1 df2
P1 P2 P3 P4 P Output
A B C C 1
A B B 2
E F G H H 3
E E 4
I'm trying to merge df2 onto df1 and the output I would like to get would look like
df3
P1 P2 P3 P4 Output
A B C NA 1
A B NA NA 2
E F G H 3
E NA NA NA 4
I've tried
df3<- merge(df1,df2, by.x = "P1", by.y = "P", all.x = T, all.y = T)
df3<- merge(df1,df2, by.x = "P2", by.y = "P", all.x = T, all.y = T)
df3<- merge(df1,df2, by.x = "P3", by.y = "P", all.x = T, all.y = T)
df3<- merge(df1,df2, by.x = "P4", by.y = "P", all.x = T, all.y = T)
however it doesn't work the way I think it should. Is there an easier function that can cleanly merge like this that I am not aware of?
Based on the output showed, it seems that for each row, we need to get the last non-NA element and do a match with the second data.frame 'P' column to get the corresponding 'Output'. If that is the logic,
df3 <- df1
df3$Output <- apply(df1, 1, function(x)
setNames(df2$Output, df2$P)[tail(x[!is.na(x)], 1)])
Or with tidyverse
library(dplyr)
library(tidyr)
df1 %>%
mutate(rn = row_number()) %>%
pivot_longer(cols = -rn, values_drop_na = TRUE) %>%
group_by(rn) %>%
slice(n()) %>%
ungroup %>%
left_join(df2, by = c('value' = 'P')) %>%
select(Output) %>%
bind_cols(df1, .)
data
df1 <- structure(list(P1 = c("A", "A", "E", "E"), P2 = c("B", "B", "F",
NA), P3 = c("C", NA, "G", NA), P4 = c(NA, NA, "H", NA)), class = "data.frame",
row.names = c(NA,
-4L))
df2 <- structure(list(P = c("C", "B", "H", "E"), Output = 1:4),
class = "data.frame", row.names = c(NA,
-4L))
You can use coalesce from the dplyr package to create a new field in df1 which will be the key between the two datasets.
library(dplyr)
#create column P, which takes first non null value
df1$P <- coalesce(df1$P4,df1$P3,df1$P2,df1$P1)
#Join data frames on P
df3 <- inner_join(df1, df2, by='P')
#Rmove P from df3
df3$P <- NULL
>> df3
P1 P2 P3 P4 Output
1 A B C <NA> 1
2 A B <NA> <NA> 2
3 E F G H 3
4 E <NA> <NA> <NA> 4
I am having some trouble with joining data frames with dplyr, where I would like to ignore the NAs.
The data that I have is quite big, but a simplified version looks like:
id <- c("id1", "id2", "id3", "id4")
A <- c("E", "F", "G", NA)
B <- c("T", NA, "N", "T")
C <- c(NA, "T", "U", NA)
df <- data.frame(A, B, C)
id A B C
1 id1 E T NA
2 id2 F NA T
3 id3 G N U
4 id4 NA T NA
I have an entry that I would like to match with df, which is e.g.:
df2 <- data.frame(A = "E", B = "T", C = "M")
A B C
1 E T M
As a result I would like to obtain all rows from df that match with df2, but the NAs should be ignored. So the result should look like this:
id A B C
1 id1 E T NA
2 id4 NA T NA
I was trying to do this with semi_join, but it did not work so far:
result <- df %>%
group_by(n = seq(n())) %>%
do(modify_if(., is.na, ~NULL) %>%
semi_join(df2, by = c("A", "B", "C"))) %>%
ungroup %>%
select(-n)
Which results in:
Error: `by` can't contain join column `C` which is missing from LHS
Call `rlang::last_error()` to see a backtrace
Who knows the answer?
Here's a solution with a mix of tidyverse and base R. I think this is pretty clear, but I'd be interested in a pure tidyverse implementation that isn't completely contrived.
The idea is to first expand all entries in df and df2 and then filter through all the columns using a loop.
The data:
id <- c("id1", "id2", "id3", "id4")
A <- c("E", "F", "G", NA)
B <- c("T", NA, "N", "T")
C <- c(NA, "T", "U", NA)
df <- data.frame(id, A, B, C, stringsAsFactors = F) # Make sure to use strings not factors
df2 <- data.frame(A = "E", B = "T", C = "M", stringsAsFactors = F)
Code:
library(tidyr)
results <- crossing(df, df2)
select_columns <- c("A", "B", "C")
for(col in select_columns) {
keep <- is.na(results[[col]]) | results[[col]] == results[[paste0(col, 1)]]
results <- results[keep,, drop=F]
}
results <- results %>% dplyr::select(id, A:C) %>% distinct
results
id A B C
1 id1 E T <NA>
2 id4 <NA> T <NA>
If you only need to do this for a single set of values this is probably the most straightforward approach:
d[A %in% c("E",NA) & B %in%c("T",NA) & C %in% c("M",NA),]
Another example using tidyverse and base (dplyr, tidyr, base):
In this I convert your df2 into a dataframe that includes all combinations of values you want to accept ( (E or NA) & (T or NA) & (M or NA) ) and then I do an inner join with this full set. There are other ways to create a dataframe of all possible combinations but this one uses tidyr fairly easily.
library(dplyr)
library(tidyr)
id <- c("id1", "id2", "id3", "id4")
A <- c("E", "F", "G", NA)
B <- c("T", NA, "N", "T")
C <- c(NA, "T", "U", NA)
df <- data.frame(A, B, C, stringsAsFactors = FALSE)
df2 <- data.frame(A = "E", B = "T", C = "M",stringsAsFactors = FALSE)
df2_expanded <- df2 %>%
rowwise() %>%
mutate(combinations = list(expand.grid(A = c(A,NA),B = c(B,NA),C = c(C,NA),stringsAsFactors = FALSE))) %>%
select(-A,-B,-C) %>%
unnest(combinations)
# A tibble: 8 x 3
# A B C
# <chr> <chr> <chr>
# 1 E T M
# 2 NA T M
# 3 E NA M
# 4 NA NA M
# 5 E T NA
# 6 NA T NA
# 7 E NA NA
# 8 NA NA NA
df %>%
inner_join(df2_expanded)
# A B C
# 1 E T <NA>
# 2 <NA> T <NA>
This is a more complex version of a previous question where I had abstracted the actual problem too much to apply the answers.
R convert tidy hierarchical data frame to hierarchical list
I've converted a hierarchical data frame with two grouping levels into a hierarchical list-grid using a for loop.
Is there a more efficient base R, tidyverse or other approach to achieve this?
In the real dataset:
The grouping variables and description are multi word strings.
The description preface - d# - is in the MWE for ease of checking.
There are 14 associated variables variously of type: character, integer and double
Rules
Group 1 and Group 2 headings to be in description column
Group 1 headings to appear once only
Group 2 heading are children of group 1 heading, and only change when there is a new group 2 heading
Descriptions are children of group 2 headings
From this
g1 g2 desc var1 var2 var3
A a d1 KS3 0.0500 2 PLs
A a d2 CTI 0.0500 9 7O0
A b d3 b8x 0.580 5 he2
A b d4 XOf 0.180 12 XJE
A b d5 ygn 0.900 11 v48
A c d6 dGY 0.770 6 UcH
A d d7 jpG 0.600 4 P5M
B d d8 Z95 0.600 10 j6O
To this
desc var1 var2 var3
A
a
d1 KS3 0.0500 2 PLs
d2 CTI 0.0500 9 7O0
b
d3 b8x 0.580 5 he2
d4 XOf 0.180 12 XJE
d5 ygn 0.900 11 v48
c
d6 dGY 0.770 6 UcH
d
d7 jpG 0.600 4 P5M
B
d
Code
library(tidyverse)
library(stringi)
set.seed(2018)
tib <- tibble(g1 = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "C"),
g2 = c("a", "a", "b", "b", "b", "c", "d", "d", "b", "b", "e", "e"),
desc = paste0("d", 1:12, " ", stri_rand_strings(12, 3)),
var1 = round(runif(12), 2),
var2 = sample.int(12),
var3 = stri_rand_strings(12, 3))
tib
# Number of rows in final table
n_rows <- length(unique(tib$g1)) + length(unique(paste0(tib$g1, tib$g2))) + nrow(tib)
# create empty output tibble
output <-
as_tibble(matrix(nrow = n_rows, ncol = ncol(tib)-1)) %>%
rename(id = V1, desc = V2, var1 = V3, var2 = V4, var3 = V5) %>%
mutate(id = NA_character_,
desc = NA_character_,
var1 = NA_real_,
var2 = NA_integer_,
var3 = NA_character_)
# Loop counters
level_1 <- 0
level_2 <- 0
output_row <- 1
for(i in seq_len(nrow(tib))){
# level 1 headings
if(tib$g1[[i]] != level_1) {
output$id[[output_row]] <- "g1"
output$desc[[output_row]] <- tib$g1[[i]]
output_row <- output_row + 1
}
# level 2 headings
if(paste0(tib$g1[[i]], tib$g2[[i]]) != paste0(level_1, level_2)) {
output$id[[output_row]] <- "g2"
output$desc[[output_row]] <- tib$g2[[i]]
output_row <- output_row + 1
}
level_1 <- tib$g1[[i]]
level_2 <- tib$g2[[i]]
# Description and data grid
output$desc[[output_row]] <- tib$desc[[i]]
output$var1[[output_row]] <- tib$var1[[i]]
output$var2[[output_row]] <- tib$var2[[i]]
output$var3[[output_row]] <- tib$var3[[i]]
output_row <- output_row + 1
}
output
Adapting the answer from tyluRp R convert tidy hierarchical data frame to hierarchical list I've hit on a solution.
library(tidyverse)
library(stringi)
set.seed(2018)
tib <- tibble(g1 = c("A", "A", "A", "A", "A", "A", "A", "B", "B", "B", "B", "C"),
g2 = c("a", "a", "b", "b", "b", "c", "d", "d", "b", "b", "e", "e"),
desc = paste0("d", 1:12, " ", stri_rand_strings(12, 3)),
var1 = round(runif(12), 2),
var2 = sample.int(12),
var3 = stri_rand_strings(12, 3))
# add unique identifier for description and variable rows
tib <-
tib %>%
rowid_to_column() %>%
mutate(rowid = paste0("z_", rowid))
# separate tibble for variables associated with descriptions
tib_var <-
tib %>%
select(rowid, var1, var2, var3)
# code adapted from tyluRp to reorder the data and add description variables
tib <-
tib %>%
select(g1, g2, desc, rowid) %>%
mutate(g2 = paste(g1, g2, sep = "_")) %>%
transpose() %>%
unlist() %>%
stack() %>%
distinct(values, ind) %>%
mutate(detect_var = str_detect(values, "^z_"),
ind = lead(case_when(detect_var == TRUE ~ values)),
values = case_when(detect_var == TRUE ~ NA_character_,
TRUE ~ values))%>%
drop_na(values) %>%
select(values, ind) %>%
mutate(values = str_remove(values, "\\D_")) %>%
left_join(tib_var, by = c("ind" = "rowid")) %>%
select(-ind) %>%
replace_na(list(var1 = "", var2 = "", var3 = ""))
I have a df
id a1 a2
1 x1 y1
2 x2 y2
and another dataframe df2
id name1 name2
1 a1 b1
1 a2 b2
2 a3 b3
3 a4 b4
3 a5 b5
df2 could contain multiple records of unique id's from df1.
I need to join the dataframes in such a way that for each row of df1, i should have one column from first record of df2 and if it exists, second column from second record.
To explain, the output should be like :
id a1 a2 n1 n2
1 x1 y1 a1 a2
2 x2 y2 a3 NA
For doing this I have split df2 on id using split
s <- split(df2, df2$id)
but i'm unsure how to use sapply over that. Any pointers for this
If we are not taking the 'name2' column
library(dplyr)
df2 %>%
filter(id %in% df$id) %>%
select(-name2) %>%
group_by(id) %>%
mutate(rn = paste0("n", row_number())) %>%
spread(rn, name1) %>%
left_join(df, .)
# id a1 a2 n1 n2
#1 1 x1 y1 a1 a2
#2 2 x2 y2 a3 <NA>
data
df <- structure(list(id = 1:2, a1 = c("x1", "x2"), a2 = c("y1", "y2"
)), .Names = c("id", "a1", "a2"), class = "data.frame", row.names = c(NA,
-2L))
df2 <- structure(list(id = c(1L, 1L, 2L, 3L, 3L), name1 = c("a1", "a2",
"a3", "a4", "a5"), name2 = c("b1", "b2", "b3", "b4", "b5")), .Names = c("id",
"name1", "name2"), class = "data.frame", row.names = c(NA, -5L))