Replace values in entire data frame with column values in R

I did some searching but could not find an obvious answer to this question, so hopefully it's not a duplicate. I have a data frame that looks like this:
X1 X2 V1 V2 V3 ... Vn
A B 0 1 2 1
B C 1 0 1 0
A C 2 1 0 1
What I want to achieve is to replace the V1 to Vn values according to the "dosage" of X2 in each row. So for row 1 (each row may have different values of X1 and X2):
if the value is 0, I want to replace it with AA;
if the value is 1, I want to replace it with AB;
if the value is 2, I want to replace it with BB.
The expected outcome is:
X1 X2 V1 V2 V3 ... Vn
A B AA AB BB AB
B C BC BB BC BB
A C CC AC AA AC
Here is the sample data frame:
df = data.frame(X1 = c("A", "B", "A"),
                X2 = c("B", "C", "C"),
                V1 = c(0, 1, 2),
                V2 = c(1, 0, 1),
                V3 = c(2, 1, 0))
Thanks for the help!

This is inspired by @Matt's answer. We can use mutate_at with paste0 to achieve this task.
## Load packages
library(dplyr)
dat2 <- dat %>%
  mutate_at(vars(-X1, -X2), .funs = list(
    ~ case_when(
      . == 0 ~ paste0(X1, X1),
      . == 1 ~ paste0(X1, X2),
      . == 2 ~ paste0(X2, X2),
      TRUE ~ NA_character_
    )
  ))
dat2
# X1 X2 V1 V2 V3 Vn
# 1 A B AA AB BB AB
# 2 B C BC BB BC BB
# 3 A C CC AC AA AC
DATA
dat <- read.table(text = "X1 X2 V1 V2 V3 Vn
A B 0 1 2 1
B C 1 0 1 0
A C 2 1 0 1 ",
stringsAsFactors = FALSE, header = TRUE)

Using your reprex, you can do the following (with your actual df, replace V1:V3 with V1:Vn):
library(dplyr)
df %>%
  mutate_at(
    vars(V1:V3),
    funs(case_when(
      . == 0 ~ "AA",
      . == 1 ~ "AB",
      . == 2 ~ "BB"
    ))
  )
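Note that funs() was deprecated in dplyr 0.8 and mutate_at() has since been superseded by across(). A rough modern equivalent, using the per-row paste0() logic from the first answer so the letters come from each row's X1/X2 (a sketch, only checked against the toy data above):
library(dplyr)
df %>%
  mutate(across(V1:V3, ~ case_when(
    .x == 0 ~ paste0(X1, X1),
    .x == 1 ~ paste0(X1, X2),
    .x == 2 ~ paste0(X2, X2)
  )))
As before, swap V1:V3 for V1:Vn (or starts_with("V")) on the real data.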

It's not an elegant solution, but for the sake of completeness: just nest two for-loops.
for (i in 1:dim(df)[1]) {
  for (j in 3:dim(df)[2]) {
    if (df[i, j] == 0) {
      df[i, j] <- paste0(df[i, 1], df[i, 1])
    } else if (df[i, j] == 1) {
      df[i, j] <- paste0(df[i, 1], df[i, 2])
    } else if (df[i, j] == 2) {
      df[i, j] <- paste0(df[i, 2], df[i, 2])
    }
  }
}
Sorry for that.
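For completeness, the same lookup can also be vectorized in base R: build the three possible strings per row and index into that matrix with the 0/1/2 values. A sketch (not from the original answers; it assumes the V columns start at column 3):
# one row per observation, one column per dosage value 0/1/2
codes <- cbind(paste0(df$X1, df$X1), paste0(df$X1, df$X2), paste0(df$X2, df$X2))
# pick codes[row, dosage + 1] for every cell of the V columns
df[, 3:ncol(df)] <- lapply(df[, 3:ncol(df), drop = FALSE],
                           function(v) codes[cbind(seq_len(nrow(df)), v + 1)])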

Use spread and gather
library(tidyverse)
df <- tibble(X1 = c("A", "B", "A"),
             X2 = c("B", "C", "C"),
             V1 = c(0, 1, 2),
             V2 = c(1, 0, 1),
             V3 = c(2, 1, 0))
Capture your from:to translation:
transl <- tibble(DOSE = c(0, 1, 2),
                 OUTCOME = c("AA", "AB", "AC"))
Then gather your values into long form:
longTbl <- df %>%
  gather(key = "TheV", value = "DOSE", na.rm = TRUE, starts_with("V")) %>%
  left_join(transl, by = "DOSE") %>%
  select(-DOSE)
# A tibble: 9 x 4
X1 X2 TheV OUTCOME
<chr> <chr> <chr> <chr>
1 A B V1 AA
2 B C V1 AB
3 A C V1 AC
4 A B V2 AB
5 B C V2 AA
6 A C V2 AB
7 A B V3 AC
8 B C V3 AB
9 A C V3 AA
You might be better off leaving it there. But we can pivot it back with spread.
widTbl <- longTbl %>%
  spread(TheV, OUTCOME)
widTbl
# A tibble: 3 x 5
X1 X2 V1 V2 V3
<chr> <chr> <chr> <chr> <chr>
1 A B AA AB AC
2 A C AC AB AA
3 B C AB AA AB
And Bob's your uncle.
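gather() and spread() still work but are retired in favour of pivot_longer() and pivot_wider(); the same pipeline in the newer tidyr idiom would look roughly like this (a sketch, reusing transl from above):
longTbl <- df %>%
  pivot_longer(starts_with("V"), names_to = "TheV", values_to = "DOSE") %>%
  left_join(transl, by = "DOSE") %>%
  select(-DOSE)
widTbl <- longTbl %>%
  pivot_wider(names_from = TheV, values_from = OUTCOME)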

Related

data frame de-duplication

I have a data frame. You can see that some rows differ only in the order "A"-"B" versus "B"-"A", and these two rows have the same Value.
df <- tibble(
  V1 = c("A", "C", "B", "D"),
  V2 = c("B", "D", "A", "C"),
  Value = c(1, 2, 1, 2)
)
V1 V2 Value
<chr> <chr> <dbl>
1 A B 1
2 C D 2
3 B A 1
4 D C 2
I want to remove one of the duplicated rows (row 0 or row 2), to make it like below:
V1 V2 Value
0 A B 1
1 C D 2
How can I remove those repetitive rows?
df[!duplicated(t(apply(df,1,sort))),]
V1 V2 Value
0 A B 1
1 C D 2
or even:
df[!duplicated(cbind(pmax(df$V1, df$V2), pmin(df$V1, df$V2))),]
V1 V2 Value
0 A B 1
1 C D 2
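Note that apply(df, 1, sort) sorts all three values in each row, including Value. That is harmless here because the duplicated pairs share the same Value, but if Value could differ between the A-B and B-A rows, it may be safer to sort only the pair columns (a small variant of the same idea):
df[!duplicated(t(apply(df[c("V1", "V2")], 1, sort))), ]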
An option with tidyverse
library(dplyr)
library(stringr)
library(purrr)
df %>%
  filter(!duplicated(pmap_chr(across(V1:V2),
                              ~ str_c(sort(c(...)), collapse = ""))))
# A tibble: 2 × 3
V1 V2 Value
<chr> <chr> <dbl>
1 A B 1
2 C D 2
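The pmin()/pmax() trick also combines with distinct() if you prefer to stay in dplyr; here is a sketch, where pair is just a throwaway helper column:
library(dplyr)
df %>%
  distinct(pair = paste(pmin(V1, V2), pmax(V1, V2)), .keep_all = TRUE) %>%
  select(-pair)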

Replacing rows between 2 dataframes in R

I have 2 data frames, and I want to write code that checks whether a row from dataframe1 exists in dataframe2; if so, I want to replace the row(s) in dataframe1 with the row(s) from dataframe2. Here is an example:
dataframe1:
name A B
AA   1 1
BB   1 0
CC   0 1
dataframe2:
name A B
AA   1 2
DD   1 3
EE   4 1
I want to switch rows between both dataframes, and the outcome will be:
dataframe1:
name A B
AA   1 2
BB   1 0
CC   0 1
To clarify, I want row AA in dataframe1 to be replaced by row AA from dataframe2.
This is what I tried to do:
df1[which(df1$name %in% df2$name)[1:nrow(df2)],] <- df2
And:
df1$name[match(df2$name,df1$name)] <- df2$name
Both didn't work unfortunately.
Thanks for helping!
Does this work:
df1
name A B
1 AA 1 1
2 BB 1 0
3 CC 0 1
df2
name A B
1 AA 1 2
2 DD 1 3
3 EE 4 1
df2$name %in% df1$name
[1] TRUE FALSE FALSE
df1[df1$name %in% df2$name, ] = df2[df2$name %in% df1$name, ]
df1
name A B
1 AA 1 2
2 BB 1 0
3 CC 0 1
natural_join is the function you are looking for:
library(rqdatatable)
dataframe1 <- data.frame(
  name = c('AA', 'BB', 'CC'),
  A = c(1, 1, 0),
  B = c(1, 0, 1)
)
dataframe2 <- data.frame(
  name = c('AA', 'DD', 'EE'),
  A = c(1, 1, 4),
  B = c(2, 3, 1)
)
natural_join(dataframe2, dataframe1, by = "name",
             jointype = 'RIGHT')
You can make an update join:
i <- match(df1$name, df2$name)
j <- which(!is.na(i))
df1[j,] <- df2[i[j],]
df1
# name A B
#1 AA 1 2
#2 BB 1 0
#3 CC 0 1
Data:
df1 <- data.frame(name = c("AA","BB","CC"), A = c(1,1,0), B = c(1,0,1))
df2 <- data.frame(name = c("AA","DD","EE"), A = c(1,1,4), B = c(2,3,1))
A dplyr way using across, left_join and coalesce
library(dplyr, warn.conflicts = F)
df1 <- data.frame(name = c("AA","BB","CC"), A = c(1,1,0), B = c(1,0,1))
df2 <- data.frame(name = c("AA","DD","EE"), A = c(1,1,4), B = c(2,3,1))
df1 %>%
  left_join(df2, by = 'name') %>%
  mutate(across(ends_with('.y'),
                ~ coalesce(., get(gsub('\\.y', '\\.x', cur_column()))),
                .names = "{gsub('.y', '', .col)}"),
         .keep = 'unused')
#> name A B
#> 1 AA 1 2
#> 2 BB 1 0
#> 3 CC 0 1
Created on 2021-07-06 by the reprex package (v2.0.0)
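For readers already on data.table, the same update join can be written by reference (a sketch, not part of the original answers; it modifies df1 in place):
library(data.table)
setDT(df1)
# update only the rows of df1 whose name appears in df2
df1[df2, on = "name", c("A", "B") := .(i.A, i.B)]
Rows of df2 with no match in df1 (DD and EE here) are simply ignored.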

How to cross tabulate the summary values across same field

This may have solutions/answers available here, but I am unable to find them.
Let us assume simple data like this:
x <- data.frame(id = rep(1:3, each = 2),
                v1 = c('A', 'B', 'A', 'B', 'A', 'C'))
> x
id v1
1 1 A
2 1 B
3 2 A
4 2 B
5 3 A
6 3 C
Now I want an output relating the v1 column with itself, but across groups on id, something like this:
v1 A B C
1 A 0 2 1
2 B 2 0 0
3 C 1 0 0
So, I proceeded like this:
library(tidyverse)
# merged the v1 column with itself with all = TRUE
x <- merge(x, x, by = "id", all = TRUE)
# removed same-group rows
x <- x[x$v1.x != x$v1.y, ]
# final code
x %>% select(-id) %>%
  group_by(v1.x, v1.y) %>%
  summarise(val = n()) %>%
  pivot_wider(names_from = v1.y, values_from = val, values_fill = 0L, names_sort = TRUE)
# A tibble: 3 x 4
# Groups: v1.x [3]
v1.x A B C
<chr> <int> <int> <int>
1 A 0 2 1
2 B 2 0 0
3 C 1 0 0
My question is: is there any better/more direct method to obtain the cross-table?
How about creating a contingency table with xtabs (which works with large data sets as well)? Then, you can use crossprod on the table and set the diagonal to zero for the final result.
ct <- xtabs(~ id + v1, data = x)
cp <- crossprod(ct, ct)
diag(cp) <- 0
cp
Instead of xtabs you can also create the cross-table simply with table. As noted by @A5C1D2H2I1M1N2O1R2T1, you can simplify to a nice one-liner equivalent:
"diag<-"(crossprod(table(x)), 0)
Output
v1
v1 A B C
A 0 2 1
B 2 0 0
C 1 0 0
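To see why this works: ct is the id-by-level incidence table, so crossprod(ct) (i.e. t(ct) %*% ct) counts, for every pair of levels, how many ids contain both, and zeroing the diagonal drops the self-pairs. With the sample data:
ct <- xtabs(~ id + v1, data = x)
ct
#    v1
# id  A B C
#   1 1 1 0
#   2 1 1 0
#   3 1 0 1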

Conditional string concatenate

I have this type of DF
DF
ID V1
1 A
2 V
3 C
4 B
5 L
6 L
I would like to get
ID V1 V2
1 A AA
2 V AV
3 C AC
4 B BB
5 L BL
6 L BL
I would like to concatenate the A and B values in V1 with the other characters in V1.
I used something like this:
DF %>%
  mutate(V2 = ifelse(V1 == "A", paste("A", ID),
                     ifelse(V1 == "B", paste("B", V1), ""))) %>%
  mutate(V2 = na_if(V2, "")) %>%
  fill(V2)
Here is a way using base R
df <- transform(df,
                V2 = ave(x = V1,
                         cumsum(V1 %in% c("A", "B")),  # grouping variable
                         FUN = function(x) paste0(x[1], x)))
Gives
df
# ID V1 V2
#1 1 A AA
#2 2 V AV
#3 3 C AC
#4 4 B BB
#5 5 L BL
#6 6 L BL
You can use %in% to find where A and B are. Use unsplit to replicate them and paste0 to make the new string.
i <- DF$V1 %in% c("A", "B")
DF$V2 <- paste0(unsplit(DF$V1[i], cumsum(i)), DF$V1)
#DF$V2 <- paste0(rep(DF$V1[i], diff(c(which(i), length(i)))), DF$V1) #Alternative
DF
# ID V1 V2
#1 1 A AA
#2 2 V AV
#3 3 C AC
#4 4 B BB
#5 5 L BL
#6 6 L BL
Here is a dplyr solution.
library(dplyr)
DF %>%
  mutate(flag = cumsum(V1 %in% c("A", "B"))) %>%
  group_by(flag) %>%
  mutate(V2 = paste0(first(V1), V1)) %>%
  ungroup() %>%
  select(-flag)
## A tibble: 6 x 3
# ID V1 V2
# <int> <chr> <chr>
#1 1 A AA
#2 2 V AV
#3 3 C AC
#4 4 B BB
#5 5 L BL
#6 6 L BL
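The same cumsum() grouping trick carries over to data.table, if that is what you already use (a sketch, not from the original answers):
library(data.table)
setDT(DF)[, V2 := paste0(V1[1], V1), by = cumsum(V1 %in% c("A", "B"))]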

add row based on variable condition in R

I have a df as follows:
df
ID type other-col
1 A1 cc
1 A2 dd
1 A3 cc
2 A1 cc
2 B1 aa
3 A2 aa
I want to add a new row wherever "ID" changes, with the value F for the "type" and "other-col" columns:
new_df
ID type other-col
1 A1 cc
1 A2 dd
1 A3 cc
1 F F <- this row added
2 A1 cc
2 B1 aa
2 F F <- this row added
3 A2 aa
How can I do it in R? Thanks!
This should be doable in a single replacement operation once you know the indexes of where each change occurs. E.g.:
idx <- match(unique(df$ID), df$ID)[-1] - 1
df <- df[sort(c(sequence(nrow(df)),idx)),]
df[seq_along(idx) + idx, c("type","other_col")] <- "F"
# ID type other_col
#1 1 A1 cc
#2 1 A2 dd
#3 1 A3 cc
#3.1 1 F F
#4 2 A1 cc
#5 2 B1 aa
#5.1 2 F F
#6 3 A2 aa
Where df was:
df <- read.table(text="ID type other_col
1 A1 cc
1 A2 dd
1 A3 cc
2 A1 cc
2 B1 aa
3 A2 aa", header=TRUE, stringsAsFactors=FALSE)
An option with group_split and add_row. We can split by 'ID' with group_split into a list of data frames, then loop through the list with map and add a row at the end of each (add_row adds a row at the end by default, but we can control the position with .before and .after). Finally, we slice out the last row, since the last 'ID' doesn't need an 'F' row.
library(tidyverse)
df1 %>%
  group_split(ID) %>%
  map_dfr(~ .x %>%
            add_row(ID = first(.$ID), type = 'F', `other-col` = 'F')) %>%
  slice(-n())
Here is another approach with a similar idea to @akrun's answer.
library(tidyverse)
dat2 <- dat %>%
  split(f = .$ID) %>%
  map_if(.p = function(x) unique(x$ID) < max(dat$ID),
         ~ bind_rows(.x, tibble(ID = unique(.x$ID), type = "F", `other.col` = "F"))) %>%
  bind_rows()
dat2
# ID type other.col
# 1 1 A1 cc
# 2 1 A2 dd
# 3 1 A3 cc
# 4 1 F F
# 5 2 A1 cc
# 6 2 B1 aa
# 7 2 F F
# 8 3 A2 aa
Data
dat <- read.table(text = "ID type other-col
1 A1 cc
1 A2 dd
1 A3 cc
2 A1 cc
2 B1 aa
3 A2 aa",
header = TRUE, stringsAsFactors = FALSE)
Update
I provided an updated answer to show that if the ID column is not integer but character, we can create a new column (ID2 in this case) by converting ID to a factor and then to an integer. The rest of the operation is similar to the original answer but based on ID2.
library(tidyverse)
dat2 <- dat %>%
  mutate(ID2 = as.integer(factor(ID, levels = unique(.$ID)))) %>%
  split(f = .$ID2) %>%
  map_if(.p = function(x) unique(x$ID2) != unique(last(.)$ID2),
         ~ bind_rows(.x, tibble(ID = unique(.x$ID), type = "F", `other.col` = "F",
                                ID2 = unique(.x$ID2)))) %>%
  bind_rows() %>%
  select(-ID2)
dat2
# ID type other.col
# 1 C A1 cc
# 2 C A2 dd
# 3 C A3 cc
# 4 C F F
# 5 A A1 cc
# 6 A B1 aa
# 7 A F F
# 8 B A2 aa
DATA
dat <- read.table(text = "ID type other-col
C A1 cc
C A2 dd
C A3 cc
A A1 cc
A B1 aa
B A2 aa",
header = TRUE, stringsAsFactors = FALSE)
Similar to akrun's answer but in base R. Basically, split the data frame by ID, rbind an extra row to each split, then recombine and remove the unneeded last row using head(..., -1):
head(n = -1,
     do.call(rbind,
             lapply(split(dat, dat$ID), function(x) {
               rbind(x, c(x$ID[1], "F", "F"))
             })))
ID type other.col
1.1 1 A1 cc
1.2 1 A2 dd
1.3 1 A3 cc
1.4 1 F F
2.4 2 A1 cc
2.5 2 B1 aa
2.3 2 F F
3.6 3 A2 aa
Using base R you could do:
cbind(ID=sort(c(dat$ID,unique(dat$ID))),do.call(rbind,by(dat[-1],dat[1],rbind,'F')))
ID type other.col
1.1 1 A1 cc
1.2 1 A2 dd
1.3 1 A3 cc
1.4 1 F F
2.4 2 A1 cc
2.5 2 B1 aa
2.3 2 F F
3.6 3 A2 aa
3.2 3 F F
Or you could do:
do.call(rbind,by(dat,dat$ID,function(x)cbind(ID = unique(x[,1]),rbind(x[-1],"F"))))
inds = head(cumsum(with(rle(df$ID), unlist(lapply(lengths, function(i) c((rep(1, i)), F = 0))))), -1)
df1 = df[inds,]
df1[which(names(inds) == "F"), c("type", "other_col")] = "F"
df1
# ID type other_col
#1 1 A1 cc
#2 1 A2 dd
#3 1 A3 cc
#3.1 1 F F
#4 2 A1 cc
#5 2 B1 aa
#5.1 2 F F
#6 3 A2 aa
A possible approach using data.table:
library(data.table)
m <- setDT(df)[, max(ID)]
df[, if (.BY$ID < m) rbind(.SD, as.list(rep("F", ncol(.SD)))) else .SD, ID]
output:
ID type other-col
1: 1 A1 cc
2: 1 A2 dd
3: 1 A3 cc
4: 1 F F
5: 2 A1 cc
6: 2 B1 aa
7: 2 F F
8: 3 A2 aa
Or, if you don't mind adding another row at the bottom, the code will be shorter: setDT(df)[, rbind(.SD, as.list(rep("F", ncol(.SD)))), ID]
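One more dplyr variant for completeness, using group_by() with group_modify() instead of group_split()/map_dfr(). This is only a sketch and assumes the column is literally named other-col, as in the question's df:
library(dplyr)
library(tibble)
df %>%
  group_by(ID) %>%
  group_modify(~ add_row(.x, type = "F", `other-col` = "F")) %>%
  ungroup() %>%
  slice(-n())  # drop the trailing F row added for the last ID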
