Combine similar columns with different character names data? - r

ID <- c("IDa", "IDb","IDc","IDe","IDd","IDe")
names1 <- c("robin", "bob", "eric", "charlie", "robin", "gabby")
matrix1 <- matrix(names1, 1, 6)
colnames(matrix1) <- c("IDa", "IDb", "IDc","IDe", "IDd", "IDe")
This is the output:
IDa
IDb
IDc
IDe
IDd
IDe
robin
bob
eric
charlie
robin
gabby
But I want it to look like this:
IDa
IDb
IDc
IDe
IDd
robin
bob
eric
charlie
robin
gabby

We may split and then cbind after padding with NA
lst1 <- split(names, ID)
do.call(cbind, lapply(lst1, `length<-`, max(lengths(lst1))))
-output
IDa IDb IDc IDd IDe
[1,] "robin" "bob" "eric" "robin" "charlie"
[2,] NA NA NA NA "gabby"

Another option:
library(reshape2)
library(tidyverse)
melt(matrix1) %>%
select(-Var1) %>%
group_by(Var2) %>%
mutate(id = row_number()) %>%
pivot_wider(
names_from = Var2,
values_from = value
) %>%
select(-id)
IDa IDb IDc IDe IDd
<chr> <chr> <chr> <chr> <chr>
1 robin bob eric charlie robin
2 NA NA NA gabby NA

Related

Collapse / Merge multiple rows with non empty cells/ values

I am trying to merge two rows by a similar group which I did by looking at different questions on stack overflow (Question1, Qestion2, Question3). All these questions stated what I want but I also have some empty fields in my data frame and I don't want to merge them. I only want to merge the similar/duplicate rows based on Col1 that contain values and not empty or NA. I use below code but it also merges cells that are empty or NA.
merge_my_rows <- df %>%
group_by(Col1) %>%
summarise(Col2 = paste(Col2, collapse = ","))
Below please is the sample df and Output df that I want.
Col1
Col2
F212
ALICE
D23
John
C64
NA
F212
BOB
C64
NA
D23
JohnY
D19
Marquis
Output df
Col1
Col2
F212
ALICE, BOB
D23
John, JohnY
C64
NA
C64
NA
D19
Marquis
You can set a new grouping column, na.grp, that gives each NA in Col2 a unique number and gives those non-NA elements a common number.
library(dplyr)
df %>%
group_by(Col1, na.grp = ifelse(is.na(Col2), cumsum(is.na(Col2)), 0)) %>%
summarise(Col2 = paste(Col2, collapse = ", "), .groups = "drop") %>%
select(-na.grp)
# # A tibble: 5 × 2
# Col1 Col2
# <chr> <chr>
# 1 C64 NA
# 2 C64 NA
# 3 D19 Marquis
# 4 D23 John, JohnY
# 5 F212 ALICE, BOB
Data
df <- read.table(text = "
Col1 Col2
F212 ALICE
D23 John
C64 NA
F212 BOB
C64 NA
D23 JohnY
D19 Marquis", header = TRUE)
Using reframe
library(dplyr)
df1 %>%
reframe(Col2 = if(all(is.na(Col2))) Col2 else toString(Col2[!is.na(Col2)])
, .by = "Col1")
-output
Col1 Col2
1 F212 ALICE, BOB
2 D23 John, JohnY
3 C64 <NA>
4 C64 <NA>
5 D19 Marquis

Function over every column of list of dfs

I have the following dfs
df1 <- data.frame(name= c("mark", "peter", "lily"), col1= c(1,2,3),col2= c(4,5,6))
df2 <- data.frame(name= c("mark", "liam", "peter"), col1= c(7,8,9),col2= c(1,2,3))
df3 <- data.frame(name= c("felix", "liam", "peter"), col1= c(3,5,8),col2= c(1,5,8))
df4 <- data.frame(name= c("felix", "lily", "liam"), col1= c(6,2,6),col2= c(4,2,2))
df_list <- list(df1,df2,df3,df4)
I use this code for calculations on two consecutive dfs:
It matches the "rows" of two consecutive dfs by "name" and calcs a ratio of col1 between those. It then returns the results as a new column to the dfs.
df_combined <- df1 %>%
left_join(df2, by="name") %>%
mutate(combined=(col1.x/col1.y)) %>%
filter(!is.na(combined)) %>%
select(name,combined)
add_match_column<-function(df){
df %>% left_join(df_combined)
}
df_list_matched <- df_list %>%
map(add_match_column)
Now I want to iterate over all other columns with the same function. Thus, col2 of two consecutive dfs in the next step and so on (my dfs have 100+ columns and all dfs have the same structure).
It appears as though you have panel data, where your observations for each period are in separate data frames. Now you want to calculate for individual i in period t the ratio between x_it and x_i(t-1).
library(tidyverse)
# It is better to import all your df's into one list instead of separately assigning them
d <- list(
data.frame(name= c("mark", "peter", "lily"), col1= c(1,2,3),col2= c(4,5,6)),
data.frame(name= c("mark", "liam", "peter"), col1= c(7,8,9),col2= c(1,2,3)),
data.frame(name= c("felix", "liam", "peter"), col1= c(3,5,8),col2= c(1,5,8)),
data.frame(name= c("felix", "lily", "liam"), col1= c(6,2,6),col2= c(4,2,2))
)
d |>
bind_rows(.id = "t") |> # Bind to one long df
arrange(name, t) |>
complete(name, t) |> # add implicit NA's
group_by(name) |>
mutate(across(where(is.numeric), ~ .x/lag(.x))) |>
ungroup() |>
filter(!is.na(col1))
#> # A tibble: 6 × 4
#> name t col1 col2
#> <chr> <chr> <dbl> <dbl>
#> 1 felix 4 2 4
#> 2 liam 3 0.625 2.5
#> 3 liam 4 1.2 0.4
#> 4 mark 2 7 0.25
#> 5 peter 2 4.5 0.6
#> 6 peter 3 0.889 2.67
Created on 2022-01-18 by the reprex package (v2.0.1)

How can I make a custom aggregation of a dataframe in R?

I have a dataframe such as
group <- c("A", "A", "B", "C", "C")
tx <- c("A-201", "A-202", "B-201", "C-205", "C-206")
feature <- c("coding", "decay", "pending", "coding", "coding")
df <- data.frame(group, tx, feature)
I want to generate a new df with the entries in tx "listed" for each feature. I want the output to look like
group <- c("A", "B", "C")
coding <- c("A-201", NA, "C-205|C-206")
decay <- c("A-202", NA, NA)
pending <- c(NA, "B-201", NA)
df.out <- data.frame(group, coding, decay, pending)
So far I did not find a means to achieve this via a dplyr function. Do I have to loop through my initial df?
You may get the data in wide format using tidyr::pivot_wider and use a function in values_fn -
df.out <- tidyr::pivot_wider(df, names_from = feature, values_from = tx,
values_fn = function(x) paste0(x, collapse = '|'))
df.out
# group coding decay pending
# <chr> <chr> <chr> <chr>
#1 A A-201 A-202 NA
#2 B NA NA B-201
#3 C C-205|C-206 NA NA
Here is an alternative way:
library(dplyr)
library(tidyr)
df %>%
group_by(group, feature) %>%
mutate(tx = paste(tx, collapse = "|")) %>%
distinct() %>%
pivot_wider(
names_from = feature,
values_from = tx
)
group coding decay pending
<chr> <chr> <chr> <chr>
1 A A-201 A-202 NA
2 B NA NA B-201
3 C C-205|C-206 NA NA
Using dcast from data.table
library(data.table)
dcast(setDT(df), group ~ feature, value.var = 'tx',
function(x) paste(x, collapse = "|"), fill = NA)
group coding decay pending
1: A A-201 A-202 <NA>
2: B <NA> <NA> B-201
3: C C-205|C-206 <NA> <NA>

How to find intersect elements of concatenated string?

# create sample df
basket_customer <- c("apple,orange,banana","apple,banana,orange","strawberry,blueberry")
basket_ideal<- c("orange,banana","orange,apple,banana","strawberry,watermelon")
customer_name <- c("john","adam","john")
visit_id <- c("1001","1001","1003")
df2 <- cbind.data.frame(basket_customer,basket_ideal,customer_name,visit_id)
df2$basket_ideal <- as.character(basket_ideal)
df2$basket_customer <- as.character(basket_customer)
The goal is to compare the basket elements (fruits) of each customer to the ideal basket and return the missing fruit.
Note the same visit_id can exists for 1 or more users so the uniqueID is (id+username) and elements are not alphabetically sorted.
expected output:
visit_id
customer_name
NOT_in_basket_ideal
NOT_in_basket_customer
1001
john
apple
NA
1001
adam
NA
NA
1003
john
blueberry
watermelon
I tried using row_wise(),intersect(),except(),and unnesting however did not succeed. Thank you
We could use Map to loop over the corresponding elements of the list columns, and use setdiff to get the elements of the first vector not in the second
cst_list <- strsplit(df2$basket_customer, ",\\s*")
idl_list <- strsplit(df2$basket_ideal, ",\\s*")
lst1 <- Map(function(x, y) if(identical(x, y)) 'equal'
else setdiff(x, y), cst_list, idl_list)
lst1[lengths(lst1) == 0] <- NA_character_
v1 <- sapply(lst1, toString)
and the second case, just reverse the order
lst2 <- Map(function(x, y) if(identical(x, y)) 'equal'
else setdiff(y, x), cst_list, idl_list)
lst2[lengths(lst2) == 0] <- NA_character_
v2 <- sapply(lst2, toString)
Combining the output from both to 'df2'
df2[c("NOT_in_basket_ideal", "NOT_in_basket_customer")] <- list(v1, v2)
-output
df2[-(1:2)]
# customer_name visit_id NOT_in_basket_ideal NOT_in_basket_customer
#1 john 1001 apple NA
#2 adam 1001 NA NA
#3 john 1003 blueberry watermelon
Or in tidyverse
library(dplyr)
library(purrr)
library(stringr)
df2 %>%
mutate(across(starts_with('basket'), ~ str_extract_all(., "\\w+"))) %>%
transmute(customer_name, visit_id,
NOT_in_basket_ideal = map2_chr(basket_customer,
basket_ideal, ~ toString(setdiff(.x, .y))),
NOT_in_basket_customer = map2_chr(basket_ideal, basket_customer,
~ toString(setdiff(.x, .y))))
# customer_name visit_id NOT_in_basket_ideal NOT_in_basket_customer
#1 john 1001 apple
#2 adam 1001
#3 john 1003 blueberry watermelon

R: Reshaping Multiple Columns from Long to Wide

Using following data:
library(tidyverse)
sample_df <- data.frame(Letter = c("a", "a", "a", "b", "b"),
Number = c(1,2,1,3,4),
Fruit = c("Apple", "Plum", "Peach", "Pear", "Peach"))
Letter Number Fruit
a 1 Apple
a 2 Plum
a 1 Peach
b 3 Pear
b 4 Peach
I want to transform a set of values from a long to a wide format:
Letter Number_1 Number_2 Fruit_1 Fruit_2 Fruit_3
a 1 2 Apple Plum Peach
b 3 4 Pear Peach
To do so, I unsuccessfully tried to create an index of each unique group combinations using c("Letter", "Number") and c("Letter", "Fruit"). Firstly, does this index need to be created, and if so how should it be done?
# Gets Unique Values, but no Index of Unique Combinations
sample_df1 <- sample_df %>%
group_by(Letter) %>%
mutate(Id1 = n_distinct(Letter, Number),
Id2 = n_distinct(Letter, Fruit))
# Gets Following Error: Column `Id1` must be length 3 (the group size) or one, not 2
sample_df1 <- sample_df %>%
group_by(Letter) %>%
mutate(Id1 = 1:n_distinct(Letter, Number),
Id2 = 1:n_distinct(Letter, Fruit))
# NOTE: Manually Created the Index Columns to show next problem
sample_df1 <- sample_df %>%
group_by(Letter) %>%
add_column(Id1 = c(1,2,1,1,2),
Id2 = c(1,2,3,1,2))
Assuming it did need to be done, I manually appended the desired values, and partially solved the problem using developmental tidyr.
# Requires Developmental Tidyr
devtools::install_github("tidyverse/tidyr")
sample_df1 %>%
pivot_wider(names_from = c("Id1", "Id2"), values_from = c("Number", "Fruit")) %>%
set_names(~ str_replace_all(.,"(\\w+.*)(_\\d)(_\\d)", "\\1\\3"))
# Letter Number_1 Number_2 Number_3 Fruit_1 Fruit_2 Fruit_3
#<fct> <dbl> <dbl> <dbl> <fct> <fct> <fct>
# a 1 2 1 Apple Plum Peach
# b 3 4 NA Pear Peach NA
However, this approach still created an unwanted Number_3 column. Using any tidyr, data.table or any other package, is there any way of getting the data in the desired format without duplicating columns?
An option would be to replace the duplicated elements by 'Letter' to NA and then in the reshaped data, remove the columns that are all NA
library(data.table)
out <- dcast(setDT(sample_df)[, lapply(.SD, function(x)
replace(x, duplicated(x), NA)), Letter], Letter ~ rowid(Letter),
value.var = c("Number", "Fruit"))
nm1 <- out[, names(which(!colSums(!is.na(.SD))))]
out[, (nm1) := NULL][]
# Letter Number_1 Number_2 Fruit_1 Fruit_2 Fruit_3
#1: a 1 2 Apple Plum Peach
#2: b 3 4 Pear Peach <NA>
If we want to use the tidyverse approach, a similar option can be used. Note that pivot_wider is from the dev version of tidyr (tidyr_0.8.3.9000)
library(tidyverse)
sample_df %>%
group_by(Letter) %>%
mutate_at(vars(-group_cols()), ~ replace(., duplicated(.), NA)) %>%
mutate(rn = row_number()) %>%
pivot_wider(
names_from = rn,
values_from = c("Number", "Fruit")) %>%
select_if(~ any(!is.na(.)))
# A tibble: 2 x 6
# Letter Number_1 Number_2 Fruit_1 Fruit_2 Fruit_3
# <fct> <dbl> <dbl> <fct> <fct> <fct>
#1 a 1 2 Apple Plum Peach
#2 b 3 4 Pear Peach <NA>

Resources