Convert column to comma separated in R - r

I have two columns, A and B, in Excel with a large amount of data. Taking both columns A and B into account, I am trying to produce column C as the output. Right now I am doing everything in Excel. I think there may be a way to do this in R, but I really don't know how to do it. Any help is appreciated. Thanks.
I have
Column A ColumnB Column C(output column)
A1 10 A2
A2 10 A1
B1 3 B2,B3,B4
B2 3 B1,B3,B4
B3 3 B1,B2,B4
B4 3 B1,B2,B3
C1 6 C2,C3
C2 6 C1,C3
C3 6 C1,C2

We can group by column B then find a set difference between the current column A character and the whole characters in the group:
library(tidyverse)

# For each row, list every other ColumnA value that shares its ColumnB value.
df %>%
  group_by(ColumnB) %>%
  mutate(
    ColumnC = map_chr(
      ColumnA,
      # `ColumnA` is the whole group's vector, `current` is this row's value;
      # the remaining values are joined with ", ".
      function(current) paste(setdiff(ColumnA, current), collapse = ", ")
    )
  )
# A tibble: 9 x 3
# Groups: ColumnB [3]
ColumnA ColumnB ColumnC
<fct> <int> <chr>
1 A1 10 A2
2 A2 10 A1
3 B1 3 B2, B3, B4
4 B2 3 B1, B3, B4
5 B3 3 B1, B2, B4
6 B4 3 B1, B2, B3
7 C1 6 C2, C3
8 C2 6 C1, C3
9 C3 6 C1, C2

I don't think the question is phrased very clearly but I am interpreting the desired results to be that you want Column C to have all the values from each group of Column B, leaving out the value of Column A. You can do this as follows:
nest Column A and join it back onto the original data frame
flatten it so you now have a vector of the Column A values
use setdiff to get the values that are not Column A
collapse into comma separated string with str_c
You can see that your desired Column C is reproduced.
library(tidyverse)
tbl <- structure(list(ColumnA = c("A1", "A2", "B1", "B2", "B3", "B4", "C1", "C2", "C3"), ColumnB = c(10L, 10L, 3L, 3L, 3L, 3L, 6L, 6L, 6L), ColumnC = c("A2", "A1", "B2,B3,B4", "B1,B3,B4", "B1,B2,B4", "B1,B2,B3", "C2,C3", "C1,C3", "C1,C2")), problems = structure(list(row = 9L, col = "ColumnC", expected = "", actual = "embedded null", file = "literal data"), row.names = c(NA, -1L), class = c("tbl_df", "tbl", "data.frame")), row.names = c(NA, -9L), class = c("tbl_df", "tbl", "data.frame"), spec = structure(list(cols = list(ColumnA = structure(list(), class = c("collector_character", "collector")), ColumnB = structure(list(), class = c("collector_integer", "collector")), ColumnC = structure(list(), class = c("collector_character", "collector"))), default = structure(list(), class = c("collector_guess", "collector"))), class = "col_spec"))
# Rebuild column C: for every row, gather the ColumnA values of its ColumnB
# group and drop the row's own value.
tbl %>%
left_join(
# One row per ColumnB with that group's ColumnA values nested in a list-column
# (it shows up as `data` in the printed result). No `by` is given, so the join
# uses the shared column, ColumnB.
tbl %>% select(-ColumnC) %>% nest(ColumnA)
) %>%
mutate(
# Unwrap each nested entry so `data` holds plain character vectors.
data = flatten(data),
# Remove the row's own ColumnA value from its group's values.
output = map2(data, ColumnA, ~ setdiff(.x, .y)),
# Collapse what is left into a single comma-separated string.
output = map_chr(output, ~ str_c(., collapse = ","))
)
#> Joining, by = "ColumnB"
#> # A tibble: 9 x 5
#> ColumnA ColumnB ColumnC data output
#> <chr> <int> <chr> <list> <chr>
#> 1 A1 10 A2 <chr [2]> A2
#> 2 A2 10 A1 <chr [2]> A1
#> 3 B1 3 B2,B3,B4 <chr [4]> B2,B3,B4
#> 4 B2 3 B1,B3,B4 <chr [4]> B1,B3,B4
#> 5 B3 3 B1,B2,B4 <chr [4]> B1,B2,B4
#> 6 B4 3 B1,B2,B3 <chr [4]> B1,B2,B3
#> 7 C1 6 C2,C3 <chr [3]> C2,C3
#> 8 C2 6 C1,C3 <chr [3]> C1,C3
#> 9 C3 6 C1,C2 <chr [3]> C1,C2
Created on 2018-08-21 by the reprex package (v0.2.0).

My understanding is to find all OTHER entries of column A that share the current value of column B
Grouping by B, and finding all A's associated with the value should do the trick (some clean-up afterward removes the current entry of A from the resulting column C)
library(dplyr)

# Example input: labels in `a`, grouping value in `b`.
a <- c("a1", "a2", "b1", "b2", "b3", "b4", "c1", "c2", "c3", "d1")
b <- c(10, 10, 3, 3, 3, 3, 6, 6, 6, 5)
dta <- data.frame(a, b, stringsAsFactors = FALSE)

# For each row, collect the other `a` values that share its `b` value.
# Whole values are compared instead of deleting the label from a pasted string
# with a regular expression: a pattern such as ",a1" also matches inside
# ",a11" and would corrupt the neighbouring label.
dta <- dta %>%
  group_by(b) %>%
  mutate(
    c = vapply(
      a,
      function(current) {
        # Inside mutate(), `a` is the current group's vector of labels.
        others <- a[a != current]
        # A row that is alone in its group has no partners: report NA.
        if (length(others) == 0) {
          NA_character_
        } else {
          paste0(others, collapse = ",")
        }
      },
      character(1),
      USE.NAMES = FALSE
    )
  ) %>%
  ungroup()

Another version of the tidyverse solution. The separate function is handy for splitting an existing column into new columns. Here it creates the Group column, which makes sure that every operation is carried out within each group. The map2 and map_chr functions are ideal for applying an operation to each element of a column. dat2 is the final output.
library(tidyverse)

dat2 <- dat %>%
  # Split ColumnA after its first character: the leading letter becomes the
  # grouping key, the original column is kept.
  separate(
    ColumnA,
    into = c("Group", "Number"),
    remove = FALSE,
    convert = TRUE,
    sep = 1
  ) %>%
  group_by(Group) %>%
  mutate(
    # The group's ColumnA values without the row's own value ...
    List = map(ColumnA, function(current) ColumnA[!(ColumnA %in% current)]),
    # ... joined into one comma-separated string.
    ColumnC = map_chr(List, function(others) str_c(others, collapse = ","))
  ) %>%
  ungroup() %>%
  # Keep only ColumnA, ColumnB and ColumnC.
  select(starts_with("Column"))

dat2
# # A tibble: 9 x 3
# ColumnA ColumnB ColumnC
# <chr> <int> <chr>
# 1 A1 10 A2
# 2 A2 10 A1
# 3 B1 3 B2,B3,B4
# 4 B2 3 B1,B3,B4
# 5 B3 3 B1,B2,B4
# 6 B4 3 B1,B2,B3
# 7 C1 6 C2,C3
# 8 C2 6 C1,C3
# 9 C3 6 C1,C2
DATA
# Example input: one row per ColumnA label together with its ColumnB value.
dat <- read.table(
  header = TRUE,
  stringsAsFactors = FALSE,
  text = "ColumnA ColumnB
A1 10
A2 10
B1 3
B2 3
B3 3
B4 3
C1 6
C2 6
C3 6"
)

# Example input: one row per ColumnA label together with its ColumnB value.
# TRUE/FALSE are spelled out because `T` and `F` are ordinary variables that
# can be reassigned, and `<-` is used for assignment.
df <- read.table(text = "
ColumnA ColumnB
A1 10
A2 10
B1 3
B2 3
B3 3
B4 3
C1 6
C2 6
C3 6
", header = TRUE, stringsAsFactors = FALSE)
library(tidyverse)

df %>%
  group_by(ColumnB) %>% # work within each ColumnB value
  mutate(
    # for every row: the group's ColumnA values other than its own,
    # joined with commas
    vals = map_chr(
      ColumnA,
      function(own) paste0(ColumnA[ColumnA != own], collapse = ",")
    )
  ) %>%
  ungroup() # drop the grouping again
# # A tibble: 9 x 3
# ColumnA ColumnB vals
# <chr> <int> <chr>
# 1 A1 10 A2
# 2 A2 10 A1
# 3 B1 3 B2,B3,B4
# 4 B2 3 B1,B3,B4
# 5 B3 3 B1,B2,B4
# 6 B4 3 B1,B2,B3
# 7 C1 6 C2,C3
# 8 C2 6 C1,C3
# 9 C3 6 C1,C2

Related

How to move elements of a column up to top of dataframe in R

So, I have a dataframe that looks something like this:
A B C
a1 NA NA
a2 NA NA
NA b1 NA
NA NA c1
NA NA c2
I want to make it look like this:
A B C
a1 b1 c1
a2 NA c2
In other words, right now the dataframe is perfectly diagonal. I want to end up with 3 orderly columns and it doesn't matter to me if there are NAs after the last element in each column. How can I do this in R? Tidyverse solution would be best but not exclusively looking for one.
We could loop across the columns, order based on the NA elements and then filter only rows having at least one non-NA
library(dplyr)

df1 %>%
  # Move the non-missing entries of every column to the top.
  mutate(across(everything(), function(col) col[order(is.na(col))])) %>%
  # Keep the rows in which at least one column still holds a value.
  filter(if_any(everything(), function(col) !is.na(col)))
-output
A B C
1 a1 b1 c1
2 a2 <NA> c2
Or using base R
# Move each column's non-missing entries to the top ...
df1[] <- lapply(df1, function(col) col[order(is.na(col))])
# ... then show only the rows that still contain at least one value.
df1[apply(!is.na(df1), 1, any), ]
A B C
1 a1 b1 c1
2 a2 <NA> c2
data
# Example input: the values sit on a "diagonal" and every other cell is NA.
df1 <- data.frame(
  A = c("a1", "a2", NA, NA, NA),
  B = c(NA, NA, "b1", NA, NA),
  C = c(NA, NA, NA, "c1", "c2"),
  stringsAsFactors = FALSE
)
Update: Code a little shorter:
# Compact every column by dropping its NAs and re-aligning what is left.
# Rows are matched by their position within each column, so the result does
# not depend on what the values look like (no parsing of numeric suffixes)
# and the original order of the values is kept.
df %>%
  pivot_longer(everything()) %>%
  drop_na(value) %>%
  group_by(name) %>%
  # position of each remaining value inside its own column
  mutate(row = row_number()) %>%
  ungroup() %>%
  # one output row per position; shorter columns are padded with NA
  pivot_wider(names_from = name, values_from = value) %>%
  select(-row)
Here is an alternative (longer :-)) way:
library(tidyverse)
# Compact the columns by going long, dropping the NAs, and going wide again.
df %>%
# Long format: one row per cell, labelled by the `name`/`value` columns that
# the later steps refer to.
pivot_longer(
everything()
) %>%
# Discard the NA cells.
drop_na() %>%
group_by(name) %>%
# Index every group's rows with the same 1..k sequence so all groups end up
# equally long; indices past a group's last row presumably come back as NA
# rows.
# NOTE(review): `df` in the question has no `name` column, so `df$name` looks
# like it is NULL and k ends up as the total row count of `df` - confirm.
summarise(cur_data()[seq(max(count(df, df$name)$n)),]) %>%
# Back to wide format with list-columns, which unnest() then expands.
pivot_wider(values_fn = list) %>%
unnest(cols = c(A, B, C)) %>%
# Remove the rows that are NA in every column.
filter(!if_all(everything(), ~ is.na(.)))
A B C
<chr> <chr> <chr>
1 a1 b1 c1
2 a2 NA c2
Another possible solution, based on purrr::map_dfc:
library(purrr)

# Per column: drop the NAs, then pad with NA up to the length of the column
# that has the most non-missing values.
map_dfc(df, function(col) {
  kept <- col[!is.na(col)]
  length(kept) <- max(colSums(!is.na(df)))
  kept
})
#> # A tibble: 2 × 3
#> A B C
#> <chr> <chr> <chr>
#> 1 a1 b1 c1
#> 2 a2 <NA> c2

Copy all pairwise combinations of dataframe and their combined name in a new dataframe

I was wondering how to generate a new dataframe containing all pairwise combinations of the value and the combined row and column name of another dataframe. To explain as an example I have the following dataframe:
# data frame whose columns are named a1 to a5
df <- data.frame(
  a1 = c(4, 2, 6, 9, 13),
  a2 = c(56, 1, 47, 2, 3),
  a3 = c(4, 6, 9, 11, 85),
  a4 = c(6, 15, 4, 12, 3),
  a5 = c(54, 94, 3, 2, 75)
)
# the rows carry the same labels, a1 to a5
rownames(df) <- paste0("a", 1:5)
df now looks like this:
a1
a2
a3
a4
a5
a1
4
56
4
6
54
a2
2
1
6
15
94
a3
6
47
9
4
3
a4
9
2
11
12
2
a5
13
3
85
3
75
I need a new dataframe of all possible combinations (so 25x2) looking like this:
Step
Value
1
a1a1
4
2
a1a2
56
3
a1a3
4
4
a1a4
6
...
...
...
25
a5a5
75
Thank you.
You could convert the data to a table and back to a data.frame.
# Treat the data frame as a two-way table; turning that table back into a
# data frame gives one row per (row name, column name) cell.
df2 <- df |>
  as.matrix() |>
  as.table() |>
  as.data.frame()
# List the cells ordered by row label.
df2[order(df2[["Var1"]]), ]
# Var1 Var2 Freq
# 1 a1 a1 4
# 6 a1 a2 56
# 11 a1 a3 4
# 16 a1 a4 6
# 21 a1 a5 54
# 2 a2 a1 2
# 7 a2 a2 1
# 12 a2 a3 6
# 17 a2 a4 15
# 22 a2 a5 94
# ...
You can put it in a long format:
library(tidyr)
library(dplyr)

df %>%
  # keep the row names as a regular column
  mutate(col1 = rownames(.)) %>%
  # wide to long: one row per (row name, column name) cell
  pivot_longer(-col1, names_to = "col2", values_to = "Value") %>%
  # build the combined label and keep only the two requested columns
  transmute(step = paste0(col1, col2), Value) %>%
  # order by the combined label
  arrange(step)
# A tibble: 25 x 2
step Value
<chr> <dbl>
1 a1a1 4
2 a1a2 56
3 a1a3 4
4 a1a4 6
5 a1a5 54
6 a2a1 2
7 a2a2 1
8 a2a3 6
9 a2a4 15
10 a2a5 94
# ... with 15 more rows
We may use stack.
# stack() puts the values of all columns into one `values` column and records
# the name of the column each value came from in `ind`.
stack(dat)
# values ind
# 1 4 a1
# 2 2 a1
# 3 6 a1
# 4 9 a1
# 5 13 a1
# 6 56 a2
# 7 ...
Or, to be precise:
# After transposing, stack() walks through the original rows one at a time:
# `ind` holds the original row name and, within each row, the values follow
# the original column order. The second half of the label therefore has to
# come from colnames(dat) (recycled once per row); rownames(dat) only gives
# the right labels when row and column names happen to coincide.
cbind(stack(as.data.frame(t(dat))), col = colnames(dat)) |>
  transform(step = paste0(ind, col)) |>
  # keep the combined label (4th column) and the value (1st column)
  subset(select = c(4, 1))
# step values
# 1 a1a1 4
# 2 a1a2 56
# 3 a1a3 4
# 4 a1a4 6
# 5 a1a5 54
# 6 a2a1 2
# 7 ...
Data:
# Example input: a 5 x 5 integer data frame whose rows and columns are both
# labelled a1 to a5.
dat <- data.frame(
  a1 = c(4L, 2L, 6L, 9L, 13L),
  a2 = c(56L, 1L, 47L, 2L, 3L),
  a3 = c(4L, 6L, 9L, 11L, 85L),
  a4 = c(6L, 15L, 4L, 12L, 3L),
  a5 = c(54L, 94L, 3L, 2L, 75L),
  row.names = c("a1", "a2", "a3", "a4", "a5")
)

How to check if pairs remain the same between years?

I have a data frame with pairs of individual birds (male and female) that were observed in several years. I am trying to figure out whether these pairs have changed from one year to the next so that I can do some further analyses.
My data is structured like this:
# One row per observed pair and year: three pairs in each of the years 1 to 3.
dat <- tibble(
  year = rep(seq_len(3), each = 3),
  Male = c(rep(c("A1", "B1", "C1"), times = 2), "A1", "B1", "C2"),
  Female = c("X1", "Y1", "Z1", "X1", "Y2", "Z2", "X1", "Y2", "Z2")
)
# A tibble: 9 x 3
year Male Female
<int> <chr> <chr>
1 1 A1 X1
2 1 B1 Y1
3 1 C1 Z1
4 2 A1 X1
5 2 B1 Y2
6 2 C1 Z2
7 3 A1 X1
8 3 B1 Y2
9 3 C2 Z2
And my expected output is something like:
# A tibble: 9 x 5
year Male Female male_state female_state
<int> <chr> <chr> <chr> <chr>
1 1 A1 X1 new new
2 1 B1 Y1 new new
3 1 C1 Z1 new new
4 2 A1 X1 reunited reunited
5 2 B1 Y2 divorced new
6 2 C1 Z2 divorced new
7 3 A1 X1 reunited reunited
8 3 B1 Y2 reunited reunited
9 3 C2 Z2 new divorced
I cannot figure out how to check whether a value from a different column is the same in the year before (e.g. if the male ID is the same for a certain female in year 2 or 3 as in the year prior). Any ideas?
This (probably overcomplicated) pipe produces the following output.
# Example data: three pairs observed in each of the years 1 to 3.
dat <- tibble(
  year = rep(1:3, each = 3),
  Male = c("A1", "B1", "C1", "A1", "B1", "C1", "A1", "B1", "C2"),
  Female = c("X1", "Y1", "Z1", "X1", "Y2", "Z2", "X1", "Y2", "Z2")
)

dat %>%
  # Label each pair and sort so that repeated pairs sit next to each other.
  mutate(pair = paste0(Male, Female)) %>%
  arrange(pair, year) %>%
  mutate(
    # Same pair as the previous row, in a later year: an existing couple.
    # A missing comparison (e.g. the first row) counts as a new couple.
    check = case_when(
      pair == lag(pair) & year > lag(year) ~ "old couple",
      TRUE ~ "new couple"
    ),
    # Same male as the previous row but a different female: a divorce.
    # A missing comparison yields the empty string.
    divorced = case_when(
      Male == lag(Male) & Female != lag(Female) ~ "divorce",
      TRUE ~ ""
    )
  )
OUTPUT:
# A tibble: 9 × 6
year Male Female pair check divorced
<int> <chr> <chr> <chr> <chr> <chr>
1 1 A1 X1 A1X1 new couple ""
2 2 A1 X1 A1X1 old couple ""
3 3 A1 X1 A1X1 old couple ""
4 1 B1 Y1 B1Y1 new couple ""
5 2 B1 Y2 B1Y2 new couple "divorce"
6 3 B1 Y2 B1Y2 old couple ""
7 1 C1 Z1 C1Z1 new couple ""
8 2 C1 Z2 C1Z2 new couple "divorce"
9 3 C2 Z2 C2Z2 new couple ""
Try this:
library(tidyverse)

# Example data: three pairs observed in each of the years 1 to 3.
dat <- tibble(
  year = rep(1:3, each = 3),
  Male = c("A1", "B1", "C1", "A1", "B1", "C1", "A1", "B1", "C2"),
  Female = c("X1", "Y1", "Z1", "X1", "Y2", "Z2", "X1", "Y2", "Z2")
)

dat |>
  mutate(pairing = str_c(Male, "|", Female)) |>
  # n = number of rows in which each pairing occurs
  add_count(pairing) |>
  # A pairing that shows up again after its first row is "reunited" for both
  # birds; everything else stays NA for now.
  group_by(pairing) |>
  mutate(
    male_state = if_else(pairing == lag(pairing), "reunited", NA_character_),
    female_state = male_state
  ) |>
  # Per male: the first row is "new"; a row that is still NA means he is with
  # a different partner, i.e. "divorced".
  group_by(Male) |>
  mutate(
    male_state = case_when(
      row_number() == 1 ~ "new",
      is.na(male_state) ~ "divorced",
      TRUE ~ male_state
    )
  ) |>
  # Same rule from the female's point of view.
  group_by(Female) |>
  mutate(
    female_state = case_when(
      row_number() == 1 ~ "new",
      is.na(female_state) ~ "divorced",
      TRUE ~ female_state
    )
  ) |>
  arrange(year, Male)
#> # A tibble: 9 × 7
#> # Groups: Female [5]
#> year Male Female pairing n male_state female_state
#> <int> <chr> <chr> <chr> <int> <chr> <chr>
#> 1 1 A1 X1 A1|X1 3 new new
#> 2 1 B1 Y1 B1|Y1 1 new new
#> 3 1 C1 Z1 C1|Z1 1 new new
#> 4 2 A1 X1 A1|X1 3 reunited reunited
#> 5 2 B1 Y2 B1|Y2 2 divorced new
#> 6 2 C1 Z2 C1|Z2 1 divorced new
#> 7 3 A1 X1 A1|X1 3 reunited reunited
#> 8 3 B1 Y2 B1|Y2 2 reunited reunited
#> 9 3 C2 Z2 C2|Z2 1 new divorced
Created on 2022-05-03 by the reprex package (v2.0.1)

Adding a pvalue column to dataframe in R

I have a dataframe that looks like this:
A1 A2 A3 B1 B2 B3
0 1 0 2 3 3
5 6 4 4 6 6
I would like to add a column based on t-testing the significance of the difference between As and Bs:
A1 A2 A3 B1 B2 B3 PValue
0 1 0 2 3 3 <some small number>
5 6 4 4 6 6 <some large number>
I tried using dplyr like this:
data %>%
mutate(PValue = t.test(unlist(c(A1,A2,A3),unlist(c(B1,B2,B3)))$p.value)
However, the resulting PValue column is constant for some reason. I would appreciate any help.
If we are doing this by row, then pmap is one way
library(tidyverse)

# Walk over the rows; each row arrives as one argument per column.
pmap_dbl(data, function(...) {
  row_values <- c(...)
  # first three values against the last three
  t.test(row_values[1:3], row_values[4:6])$p.value
}) %>%
  # attach the p-values to the original data as a new column
  bind_cols(data, PValue = .)
# A1 A2 A3 B1 B2 B3 PValue
#1 0 1 0 2 3 3 0.007762603
#2 5 6 4 4 6 6 0.725030185
or another option is rowwise with do
# Row-by-row variant: t-test columns 1-3 against columns 4-6 of each row and
# return the row together with its p-value.
# NOTE(review): do() is superseded in current dplyr releases - confirm before
# reusing this pattern.
data %>%
rowwise() %>%
# presumably `.` is the current row here, so .[1:3] / .[4:6] pick its first
# and last three columns - confirm against the dplyr version in use
do(data.frame(., PValue = t.test(unlist(.[1:3]), unlist(.[4:6]))$p.value))
# A tibble: 2 x 7
# A1 A2 A3 B1 B2 B3 PValue
#* <int> <int> <int> <int> <int> <int> <dbl>
#1 0 1 0 2 3 3 0.00776
#2 5 6 4 4 6 6 0.725
Or we can gather to 'long' format and then do the group by t.test
# Reshape to long format and run one t-test per original row.
data %>%
  # Number the rows with integers. Character row names would make the grouped
  # summary come back in lexicographic order ("1", "10", "2", ...), so with
  # ten or more rows the p-values would be bound to the wrong rows.
  mutate(rn = row_number()) %>%
  pivot_longer(-rn, names_to = "key", values_to = "val") %>%
  group_by(rn) %>%
  # A columns against B columns within each row
  summarise(PValue = t.test(val[str_detect(key, "A")],
                            val[str_detect(key, "B")])$p.value) %>%
  pull(PValue) %>%
  # attach the p-values to the original data as a new column
  bind_cols(data, PValue = .)
data
# Example input: two rows with three A measurements and three B measurements.
data <- data.frame(
  A1 = c(0L, 5L),
  A2 = c(1L, 6L),
  A3 = c(0L, 4L),
  B1 = c(2L, 4L),
  B2 = c(3L, 6L),
  B3 = c(3L, 6L)
)
Also with apply in Base R:
# Run the t-test once per row (first three values against the last three) and
# store its p-value in a new column.
data[["PValue"]] <- apply(
  data,
  1,
  function(row) t.test(row[1:3], row[4:6])$p.value
)
or:
library(dplyr)

# Same row-wise t-test, added as a column inside a pipeline; `.` is the data
# frame coming through the pipe.
data %>%
  mutate(
    PValue = apply(., 1, function(row) t.test(row[1:3], row[4:6])$p.value)
  )
Result:
A1 A2 A3 B1 B2 B3 PValue
1 0 1 0 2 3 3 0.007762603
2 5 6 4 4 6 6 0.725030185

Joining two dataframes by concatenating columns

I have two dataframes with the same structure - both have two ID columns and 25 string data columns. I want to join the two and concatenate the strings in the data columns when the IDs match. So, for example:
df_1:
id_1 id_2 col_1 col2 ... col_25
a1 b1 A A ... <NA>
a1 b2 A <NA> ... A
a2 b1 <NA> <NA> ... A
df_2:
id_1 id_2 col_1 col2 ... col_25
a1 b1 B <NA> ... <NA>
a1 b2 <NA> B ... B
a1 b3 B <NA> ... B
Combined, this should give
df_combined:
id_1 id_2 col_1 col2 ... col_25
a1 b1 A, B A ... <NA>
a1 b2 A B ... A, B
a1 b3 B <NA> ... B
a2 b1 <NA> <NA> ... A
When I try to use join or merge, it repeats everything except the ID columns (so I end up with 50 data columns). Do I need to use something else?
Thanks!
You can do this if you don't have any empty string :
library(dplyr)
# Stack both data frames, then collapse every data column per (id_1, id_2).
bind_rows(df_1,df_2) %>%
group_by(id_1,id_2) %>%
# Join the non-missing values of each column with ", "; a column with nothing
# but NA in a group collapses to the empty string.
summarize_all(~ paste(na.omit(.x),collapse=", ")) %>%
# Turn those empty strings back into NA. This is why the approach only works
# when the data contains no genuine empty strings.
`[<-`(.=="",value=NA)
with magrittr you can avoid the not so pretty '[<-' and replace it by inset
library(magrittr)

bind_rows(df_1, df_2) %>%
  group_by(id_1, id_2) %>%
  # per id pair, join the non-missing values of every column with ", "
  summarize_all(function(values) paste(na.omit(values), collapse = ", ")) %>%
  # cells that collapsed to the empty string become NA again
  inset(. == "", value = NA)
There is an alternative solution using melt() and dcast() to reshape the data:
library(data.table)
# Three chained steps:
#   1. turn both inputs into data.tables with setDT() and stack them;
#   2. melt every column whose name matches "col" into long format, dropping
#      the NA cells (na.rm = TRUE);
#   3. dcast back to one row per (id_1, id_2), joining the values of each
#      variable with toString and filling empty combinations with NA.
rbind(setDT(df_1), setDT(df_2))[
, melt(.SD, measure.var = patterns("col"), na.rm = TRUE)][
, dcast(.SD, id_1 + id_2 ~ variable, toString, fill = NA)]
id_1 id_2 col_1 col2 col_25
1: a1 b1 A, B A NA
2: a1 b2 A B A, B
3: a1 b3 B NA B
4: a2 b1 NA NA A
Data
# Example inputs, read from the tables shown in the question. The fifth
# column (the "..." placeholder) is dropped and the text "<NA>" is read as a
# missing value.
df_1 <- fread(
  "id_1 id_2 col_1 col2 ... col_25
a1 b1 A A ... <NA>
a1 b2 A <NA> ... A
a2 b1 <NA> <NA> ... A",
  na.strings = "<NA>",
  drop = 5L
)

df_2 <- fread(
  "id_1 id_2 col_1 col2 ... col_25
a1 b1 B <NA> ... <NA>
a1 b2 <NA> B ... B
a1 b3 B <NA> ... B",
  na.strings = "<NA>",
  drop = 5L
)
To elaborate on the idea commented by @zx8754, using the dplyr package:
library(dplyr)

# Stack both data frames, then collapse every data column per (id_1, id_2).
df1 %>%
  bind_rows(df2) %>%
  group_by(id_1, id_2) %>%
  summarise_all(function(x) {
    # Drop missing and empty entries before pasting. Blanking the NAs first
    # and trimming afterwards would leave a double space whenever a value in
    # the middle of a group is missing.
    present <- x[!is.na(x) & x != ""]
    # A group without any value stays NA instead of becoming "".
    if (length(present) == 0) {
      NA_character_
    } else {
      paste(present, collapse = " ")
    }
  })
which gives,
# A tibble: 4 x 5
# Groups: id_1 [2]
id_1 id_2 col_1 col2 col_25
<chr> <chr> <chr> <chr> <chr>
1 a1 b1 A B A <NA>
2 a1 b2 A B A B
3 a1 b3 B <NA> B
4 a2 b1 <NA> <NA> A
NOTE:
The script above assumes that your NAs are actual NA values (not the string "NA").
It also assumes that your variables are character vectors (convert them with as.character if they are not).
DATA
dput(df1)
structure(list(id_1 = c("a1", "a1", "a2"), id_2 = c("b1", "b2",
"b1"), col_1 = c("A", "A", NA), col2 = c("A", NA, NA), col_25 = c(NA,
"A", "A")), .Names = c("id_1", "id_2", "col_1", "col2", "col_25"
), row.names = c(NA, -3L), class = "data.frame")
> dput(df2)
structure(list(id_1 = c("a1", "a1", "a1"), id_2 = c("b1", "b2",
"b3"), col_1 = c("B", NA, "B"), col2 = c(NA, "B", NA), col_25 = c(NA,
"B", "B")), .Names = c("id_1", "id_2", "col_1", "col2", "col_25"
), row.names = c(NA, -3L), class = "data.frame")

Resources