Merge three Variables to one and replicate observations

Merge three Variables to one and replicate observations - r

I have a Dataframe which looks like the following:
B <- data.frame(
nr=c(1,2,3,4,5),
A=c('a','b','c','d','e'),
B=c("s", "t", "i", "u", "z"),
B1=c("", "v", "", "", ""),
B2 =c("", "g", "", "", ""))
B <- B %>% mutate_all(na_if,"")
Since my Varaibales B1 and B2 only have one value, I would like to merge B1 and B2 to the Variable B. Therefor it should create two new observation and replicating every other Variable of this Oberservation.
It should look like the following:
B <- data.frame(
nr=c(1,2,2, 2, 3,4,5),
A=c("a","b", "b", "b", "c","d","e"),
B=c("s", "v", "g", "t", "i", "u", "z"))
Thanks for your help!!

Reshape to 'long' format with pivot_longer on the 'B' columns and remove the NA with values_drop_na = TRUE
library(dplyr)
library(tidyr)
B %>%
pivot_longer(cols = starts_with("B"), values_to = "B",
values_drop_na = TRUE, names_to = NULL)
-output
# A tibble: 7 × 3
nr A B
<dbl> <chr> <chr>
1 1 a s
2 2 b t
3 2 b v
4 2 b g
5 3 c i
6 4 d u
7 5 e z

Related

R - cleaning data with repeated columns for different locations

#Edited to make my data more similar to the data I'm working with and example of what I have tried
I am working with a Qualtrics survey where blocks of questions repeat themselves based on previous questions using a function in the survey build called "loop and merge". I'm trying to pull out like questions and then use rbind so that each question only shows up once in a column. I have a basic example below, however in my actual data, the repeats happen 36 times.
example data frame:
capacity_1 <- data.frame("1_q1" = 1:4,
"1_q2" = c("a", "b", "c", "d"),
'1_q3' = 10:13,
'1_q4' = 100:103,
'1_q5' = 110:113,
'1_q6' = 11:14,
"2_q1" = 22:25,
"2_q2" = c("i", "j", "k", "l"),
'2_q3' = 20:23,
'2_q4' = 200:203,
'2_q5' = 210:213,
'2_q6' = 21:24,
"3_q1" = 90:93,
"3_q2" = c("p", "q", "r", "s"),
'3_q3' = 10:13,
'3_q4' = 300:303,
'3_q5' = 310:313,
'3_q6' = 31:34,check.names = FALSE)
note that the "1_" at the start of "1_q1" is the county's reference number
What I could do but that is inefficient, especially since my actual data repeats these questions 36 times:
dat_1 <- dat %>%
select(1:2) %>%
rename(q = 1:2) %>%
mutate("county" = 1)
dat_2 <- dat %>%
select(3:4) %>%
rename(q = 1:2) %>%
mutate("county" = 2)
dat_3 <- dat %>%
select(5:6) %>%
rename(q = 1:2)%>%
mutate("county" = 3)
dat_final <- rbind(dat_1, dat_2, dat_3)
the "dat_final" data frame is what I'd like the data to look like, but also have formatted again here:
dat_clean <- data.frame("q1" = c(1:4, 22:25, 90:93),
"q2" = c("a", "b", "c", "d",
"i", "j", "k", "l",
"p", "q", "r", "s"),
"county" = c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3))
Update - Tried suggestion below, and get the error "error in "set_names()" the size of 'nm' (6) must be compatible with the size of 'x'(2)
do.call(
rbind,
lapply(seq(1,ncol(capacity_1),6), \(i) {
capacity_1 %>%
select(c(i,i+5)) %>%
rename_all(~c("capacity_outpatient", "capacity_inpatient", "capacity_housing",
"capacity_recovery", "capacity_demand", "capacity_notes")) %>%
mutate(county=(i+5)/6)
})
)

You can do the following, which uses a seq from 1 to ncol(dat), by 2:
do.call(
rbind,
lapply(seq(1,ncol(dat),2), \(i) {
dat %>% select(c(i,i+1)) %>% rename_all(~c("q1","q2")) %>% mutate(county=(i+1)/2)
})
)
Output:
q1 q2 county
1 1 a 1
2 2 b 1
3 3 c 1
4 4 d 1
5 22 i 2
6 23 j 2
7 24 k 2
8 25 l 2
9 90 p 3
10 91 q 3
11 92 r 3
12 93 s 3
Another approach, with data.table
library(data.table)
setDT(dat)
rbindlist(lapply(seq(1,ncol(dat),2), \(i) {
setnames(dat[,i:(i+1)],c("q1","q2"))
}), use.names=F,idcol = "county")
Output:
county q1 q2
1: 1 1 a
2: 1 2 b
3: 1 3 c
4: 1 4 d
5: 2 22 i
6: 2 23 j
7: 2 24 k
8: 2 25 l
9: 3 90 p
10: 3 91 q
11: 3 92 r
12: 3 93 s

A solution using dplyr, purrr, stringr - This solution is not affected by columns orders, number of q columns. It just use the perfix as base for processing data.
library(dplyr)
library(purrr)
library(stringr)
dat <- data.frame("1_q1" = 1:4,
"1_q2" = c("a", "b", "c", "d"),
"2_q1" = 22:25,
"2_q2" = c("i", "j", "k", "l"),
"3_q1" = 90:93,
"3_q2" = c("p", "q", "r", "s"), check.names = FALSE)
# Here is the indexes of county that want to extract from df
county_index <- c("1", "2", "3")
# Function that take index as input and will extract data from `dat` df
edit_df <- function(index) {
dat %>%
# select column start with index prefix
select(matches(paste0(index, "_"))) %>%
# remove the index prefix from string
rename_all(~ str_replace(., regex("^\\d+_", ignore_case = TRUE), "")) %>%
# add county column with the input inex
mutate("county" = as.numeric(index))
}
Result using purrr::map_dfr
# map the county index that want to extract from original df and edit_df function
dat_clean <- map_dfr(.x = county_index, .f = edit_df)
dat_clean
#> q1 q2 county
#> 1 1 a 1
#> 2 2 b 1
#> 3 3 c 1
#> 4 4 d 1
#> 5 22 i 2
#> 6 23 j 2
#> 7 24 k 2
#> 8 25 l 2
#> 9 90 p 3
#> 10 91 q 3
#> 11 92 r 3
#> 12 93 s 3
Created on 2022-05-25 by the reprex package (v2.0.1)

Join data but ignore missing values

I am having some trouble with joining data frames with dplyr, where I would like to ignore the NAs.
The data that I have is quite big, but a simplified version looks like:
id <- c("id1", "id2", "id3", "id4")
A <- c("E", "F", "G", NA)
B <- c("T", NA, "N", "T")
C <- c(NA, "T", "U", NA)
df <- data.frame(A, B, C)
id A B C
1 id1 E T NA
2 id2 F NA T
3 id3 G N U
4 id4 NA T NA
I have an entry that I would like to match with df, which is e.g.:
df2 <- data.frame(A = "E", B = "T", C = "M")
A B C
1 E T M
As a result I would like to obtain all rows from df that match with df2, but the NAs should be ignored. So the result should look like this:
id A B C
1 id1 E T NA
2 id4 NA T NA
I was trying to do this with semi_join, but it did not work so far:
result <- df %>%
group_by(n = seq(n())) %>%
do(modify_if(., is.na, ~NULL) %>%
semi_join(df2, by = c("A", "B", "C"))) %>%
ungroup %>%
select(-n)
Which results in:
Error: `by` can't contain join column `C` which is missing from LHS
Call `rlang::last_error()` to see a backtrace
Who knows the answer?

Here's a solution with a mix of tidyverse and base R. I think this is pretty clear, but I'd be interested in a pure tidyverse implementation that isn't completely contrived.
The idea is to first expand all entries in df and df2 and then filter through all the columns using a loop.
The data:
id <- c("id1", "id2", "id3", "id4")
A <- c("E", "F", "G", NA)
B <- c("T", NA, "N", "T")
C <- c(NA, "T", "U", NA)
df <- data.frame(id, A, B, C, stringsAsFactors = F) # Make sure to use strings not factors
df2 <- data.frame(A = "E", B = "T", C = "M", stringsAsFactors = F)
Code:
library(tidyr)
results <- crossing(df, df2)
select_columns <- c("A", "B", "C")
for(col in select_columns) {
keep <- is.na(results[[col]]) | results[[col]] == results[[paste0(col, 1)]]
results <- results[keep,, drop=F]
}
results <- results %>% dplyr::select(id, A:C) %>% distinct
results
id A B C
1 id1 E T <NA>
2 id4 <NA> T <NA>

If you only need to do this for a single set of values this is probably the most straightforward approach:
d[A %in% c("E",NA) & B %in%c("T",NA) & C %in% c("M",NA),]

Another example using tidyverse and base (dplyr, tidyr, base):
In this I convert your df2 into a dataframe that includes all combinations of values you want to accept ( (E or NA) & (T or NA) & (M or NA) ) and then I do an inner join with this full set. There are other ways to create a dataframe of all possible combinations but this one uses tidyr fairly easily.
library(dplyr)
library(tidyr)
id <- c("id1", "id2", "id3", "id4")
A <- c("E", "F", "G", NA)
B <- c("T", NA, "N", "T")
C <- c(NA, "T", "U", NA)
df <- data.frame(A, B, C, stringsAsFactors = FALSE)
df2 <- data.frame(A = "E", B = "T", C = "M",stringsAsFactors = FALSE)
df2_expanded <- df2 %>%
rowwise() %>%
mutate(combinations = list(expand.grid(A = c(A,NA),B = c(B,NA),C = c(C,NA),stringsAsFactors = FALSE))) %>%
select(-A,-B,-C) %>%
unnest(combinations)
# A tibble: 8 x 3
# A B C
# <chr> <chr> <chr>
# 1 E T M
# 2 NA T M
# 3 E NA M
# 4 NA NA M
# 5 E T NA
# 6 NA T NA
# 7 E NA NA
# 8 NA NA NA
df %>%
inner_join(df2_expanded)
# A B C
# 1 E T <NA>
# 2 <NA> T <NA>

Filter directed co-occurrences in a table

I have co-occurrence data that can be represented in two columns. The entries in each column are from the same set of possibilities. Ultimately I am aiming to plot a directed network but first I would like to split the table into those that reciprocal (i.e. both X->Y and Y->X) and those that occur in just one direction (i.e. only Y->Z). Here is an example:
library(tidyverse)
# Example data
from <- c("A", "B", "F", "Q", "T", "S", "D", "E", "A", "T", "F")
to <- c("E", "D", "Q", "S", "F", "T", "B", "A", "D", "A", "E")
df <- data_frame(from, to)
df
# A tibble: 11 x 2
from to
<chr> <chr>
1 A E
2 B D
3 F Q
4 Q S
5 T F
6 S T
7 D B
8 E A
9 A D
10 T A
11 F E
and here is my desired output:
# Desired output 1 - reciprocal co-occurrences
df %>%
slice(c(1,2)) %>%
rename(item1 = from, item2 = to)
# A tibble: 2 x 2
item1 item2
<chr> <chr>
1 A E
2 B D
# Desired output 2 - single occurrences
df %>%
slice(c(3,4,6,6,9,10,11))
# A tibble: 7 x 2
from to
<chr> <chr>
1 F Q
2 Q S
3 S T
4 S T
5 A D
6 T A
7 F E
If the co-occurrences are reciprocal it does not matter what order the entries are in I only need their names co-occurrences are not I need to know the direction.
This feels like a graph problem so I have had a go but am unfamiliar with working with this type of data and most tutorials seem to cover undirected graphs. Looking at the tidygraph package which I understand uses the igraph package I have tried this:
library(tidygraph)
df %>%
as_tbl_graph(directed = TRUE) %>%
activate(edges) %>%
mutate(recip_occur = edge_is_mutual()) %>%
as_tibble() %>%
filter(recip_occur == TRUE)
# A tibble: 4 x 3
from to recip_occur
<int> <int> <lgl>
1 1 8 TRUE
2 2 7 TRUE
3 7 2 TRUE
4 8 1 TRUE
However this divorces the edges from the nodes and repeats reciprocal co-occurrences. Does anyone have experience with this sort of data?

try this:
data:
from <- c("A", "B", "F", "Q", "T", "S", "D", "E", "A", "T", "F")
to <- c("E", "D", "Q", "S", "F", "T", "B", "A", "D", "A", "E")
df <- data_frame(from, to)
code:
recursive_IND <-
1:nrow(df) %>%
sapply(function(x){
if(any((as.character(df[x,]) == t(df[,c(2,1)])) %>% {.[1,] & .[2,]}))
return(T) else return(F)
})
df[recursive_IND,][!(df[recursive_IND,] %>% apply(1,sort) %>% t %>% duplicated(.)),]
df[!recursive_IND,]
result:
# A tibble: 2 x 2
# from to
# <chr> <chr>
#1 A E
#2 B D
# A tibble: 7 x 2
# from to
# <chr> <chr>
#1 F Q
#2 Q S
#3 T F
#4 S T
#5 A D
#6 T A
#7 F E

stringr: replace string by supplying pattern through a character vector

Here is my data:
df <- tibble::tribble(
~A, ~B,
"C", "G",
"D", "H",
"E", "I",
"F", "J")
value1 <- "D"
value2 <- "C"
And, in variable A, I want to replace D and C with "m" and "n", something like this, but it's not working!
df %>% mutate(X = A %>% str_replace_all(c(value1 = "m", value2 = "n")))
My desired output is:
df %>% mutate(X = A %>% str_replace_all(c("D" = "m", "C" = "n")))
But instead of supplying "D" and "C" manually, I want to programmatically supply these, something in line with...using value1 and value2.
How should I do that?

You could try using setNames to set the names of m and n like:
library(dplyr)
library(stringr)
df %>% mutate(X = A %>% str_replace_all(setNames(c("m","n"), c(value1, value2))))
# A tibble: 4 x 3
# A B X
# <chr> <chr> <chr>
#1 C G n
#2 D H m
#3 E I E
#4 F J F
And then checking that it's equal to your desired result:
identical(
df %>% mutate(X = A %>% str_replace_all(c("D" = "m", "C" = "n"))),
df %>% mutate(X = A %>% str_replace_all(setNames(c("m","n"), c(value1, value2)))))
#[1] TRUE
I also included the other packages you use: dplyr and stringr

You can think of creating a named vector and use it as replacement vector.
replacementVector <- c("m","n")
names(replacementVector) <- c("D","C")
Now, use the replacementVector in dplyr chain along with ifelse as:
df %>% mutate(X = ifelse(is.na(replacementVector[A]), A, replacementVector[A]))
# # A tibble: 4 x 3
# A B X
# <chr> <chr> <chr>
# 1 C G n
# 2 D H m
# 3 E I E
# 4 F J F
Data:
library(tidyverse)
df <- tibble::tribble(
~A, ~B,
"C", "G",
"D", "H",
"E", "I",
"F", "J")

As is vectorized over string and replacement if you put all the values in the same vector you can just run
df %>% mutate(X = A %>% str_replace_all(c("C","D"), c("m","n")))

We could use chartr
df %>%
mutate(X = chartr('DC', 'mn', A))
# A tibble: 4 x 3
# A B X
# <chr> <chr> <chr>
#1 C G n
#2 D H m
#3 E I E
#4 F J F

Loop to Replace Matching Values

I'm looking for an easy and elegant way to accomplish this.
So if I have dataset x and relationship is A -> B -> Z -> Y and D -> H -> G, I would like to create dataset y. Unfortunately, they are not necessarily in order:
> x <- data.frame(
+ from = as.character(c("A", "E", "B", "D", "H", "Z")),
+ to = as.character(c("B", "E", "Z", "H", "G", "Y")))
>
> y <- data.frame(
+ from = as.character(c("A", "E", "B", "D", "H", "Z")),
+ to = as.character(c("Y", "E", "Y", "G", "G", "Y")))
>
> x
from to
1 A B
2 E E
3 B Z
4 D H
5 H G
6 Z Y
> y
from to
1 A Y
2 E E
3 B Y
4 D G
5 H G
6 Z Y
I have a fairly large dataset (currently 500k rows; will grow in the future) and actually care about the performance; I'm not sure if there are any other ways to do this without a for-loop or even to vectorize/parallelize the process.
I'm thinking about splitting and removing all rows where from == to or creating an indicator to skip certain rows so the loop does not have to go through the entire dataset each time.
I'd also like to know what the breakpoint should be if I do create a loop; I'm not sure how to define when the loop should stop.
Any suggestions would be appreciated. Thanks!

We can use dplyr to create a grouping variable by comparing the adjacent elements of 'to' and 'from' and change the values in 'to' the last element of 'to'
library(dplyr)
x %>%
group_by(grp = cumsum(lag(lead(from, default = last(from)) !=
as.character(to), default = TRUE))) %>%
mutate(to = last(to)) %>%
ungroup %>%
select(-grp)
# A tibble: 4 x 2
# from to
# <fctr> <fctr>
#1 A D
#2 B D
#3 C D
#4 E E

Another solution can be achieved using lag from dplyr and fill from tidyr as:
library(tidyverse)
x %>% arrange(from) %>%
mutate(samegroup = ifelse(from == lag(to), 1, 0)) %>%
mutate(group = ifelse(samegroup == 0 | is.na(samegroup), row_number(), NA)) %>%
fill(group) %>%
group_by(group) %>%
mutate(to = last(to)) %>%
ungroup() %>%
select(-samegroup, - group)
# A tibble: 6 x 2
# from to
# <chr> <chr>
#1 A D
#2 B D
#3 C D
#4 E E
#5 F H
#6 G H
Data used
x <- data.frame(from = as.character(c("A", "B", "F", "C", "G", "E")),
to = as.character(c("B", "C", "G", "D", "H", "E")),
stringsAsFactors = FALSE)

Develop Reference

r css asp.net wordpress firebase qt symfony nginx http apache-flex

Merge three Variables to one and replicate observations - r

Related

R - cleaning data with repeated columns for different locations

Join data but ignore missing values

Filter directed co-occurrences in a table

stringr: replace string by supplying pattern through a character vector

Loop to Replace Matching Values

Categories

Resources