Combine character columns into new column - r

I'd be very grateful if you could help me with the following, as after a few attempts I still haven't been able to get the right outcome.
I've got this data:
dd_1 <- data.frame(ID = c("1","2", "3", "4", "5"),
Class_a = c("a",NA, "a", NA, NA),
Class_b = c(NA, "b", "b", "b", "b"))
And I'd like to produce a new column 'CLASS':
dd_2 <- data.frame(ID = c("1","2", "3", "4", "5"),
Class_a = c("a",NA, "a", NA, NA),
Class_b = c(NA, "b", "b", "b", "b"),
CLASS = c("a", "b", "a-b", "b", "b"))
Thanks a lot!

Here it is:
# Join the two class columns; paste() turns a missing value into the literal
# string "NA", giving e.g. "a-NA", "NA-b" or "a-b".
tmp <- paste(dd_1$Class_a, dd_1$Class_b, sep = "-")
# Strip only a leading "NA-" or a trailing "-NA". Anchoring the pattern keeps
# real values that merely contain those letters (e.g. "DNA-x") intact.
tmp <- gsub("^NA-|-NA$", "", tmp)
(dd_2 <- cbind(dd_1, tmp))
First we concatenate (join as strings) the 2 columns. paste treats NAs as ordinary strings, i.e. "NA", so we get either a-NA, NA-b, or a-b. Then we substitute NA- or -NA with an empty string.
Which results in:
## ID Class_a Class_b tmp
## 1 1 a <NA> a
## 2 2 <NA> b b
## 3 3 a b a-b
## 4 4 <NA> b b
## 5 5 <NA> b b

Another option:
# Build CLASS from the two class columns: if one side is missing use the
# other, otherwise join both with "-".
dd_1$CLASS <- with(dd_1, {
  class_a <- as.character(Class_a)
  class_b <- as.character(Class_b)
  both <- paste(class_a, class_b, sep = "-")
  ifelse(is.na(class_a), class_b, ifelse(is.na(class_b), class_a, both))
})
This way you would check if any of the classes is NA and return the other, or, if none is NA, return both separated by "-".

Here's a short solution with apply:
# For each row, take the two class columns (positions 2 and 3), drop the
# missing values and join whatever is left with "-".
dd_2 <- cbind(dd_1, CLASS = apply(dd_1[2:3], 1,
function(x) paste(na.omit(x), collapse = "-")))
The result
ID Class_a Class_b CLASS
1 1 a <NA> a
2 2 <NA> b b
3 3 a b a-b
4 4 <NA> b b
5 5 <NA> b b

Related

Merge if non duplicates

I am trying to merge two data frames (df1 and df2) based on two keys (KEY1 and KEY2). However, in df1, KEY1 is not unique. I want to merge df1 and df2 only where KEY1 is unique. I generated a count variable which counts the number of occurrences of each KEY1 value, so I want to merge df1 and df2 only if count equals 1.
Here is an example data frame:
df1$KEY1 <- as.data.frame(c("a", "a", "b", "c", "d"))
df1$count <- as.data.frame(c("2", "2", "1", "1", "1"))
df2$KEY2 <- as.data.frame(c("a", "b", "c", "d", "e"))
df2$value <- as.data.frame(c("85", "25", "581", "12", "4"))
My question is: how to perform the merge only if count equals 1?
df1 <- if(count==1,merge(df1, df2, by.x=KEY1, by.y=KEY2, all.x=TRUE), ?)
My goal is to get this:
df1$KEY1 <- as.data.frame(c("a", "a", "b", "c", "d"))
df1$count <- as.data.frame(c("2", "2", "1", "1", "1"))
df1$value <- as.data.frame(c("NA", "NA", "25", "581", "12"))
You can perform a join and change the values to NA if count is not 1.
library(dplyr)
# Left join so that every row of df1 is kept (like merge(..., all.x = TRUE)),
# even when its KEY1 has no partner in df2; then blank out the value for keys
# that occur more than once.
left_join(df1, df2, by = c("KEY1" = "KEY2")) %>%
  mutate(value = replace(value, count != 1, NA))
# KEY1 count value
#1 a 2 <NA>
#2 a 2 <NA>
#3 b 1 25
#4 c 1 581
#5 d 1 12
Similarly, in base R -
# all.x = TRUE keeps df1 rows whose KEY1 has no match in df2 (value is NA);
# the value is then blanked out for keys that occur more than once.
merge(df1, df2, by.x = "KEY1", by.y = "KEY2", all.x = TRUE) |>
  transform(value = replace(value, count != 1, NA))
data
df1 <- data.frame(KEY1 = c("a", "a", "b", "c", "d"),
count = c("2", "2", "1", "1", "1"))
df2 <- data.frame(KEY2 = c("a", "b", "c", "d", "e"),
value = c("85", "25", "581", "12", "4"))
If you insist on using base, what you are looking for is the incomparables argument in merge. Values of the key included in it aren't matched.
# Count how often each KEY1 value occurs in df1
tab <- table(df1$KEY1)
tab
# Keys occurring more than once are passed as `incomparables`, so merge()
# does not match them; all.x = TRUE still keeps those df1 rows.
merge(df1, df2, by.x="KEY1", by.y="KEY2", all.x=TRUE,
incomparables = names(tab)[tab>1])
The output is:
KEY1 count value
1 a 2 <NA>
2 a 2 <NA>
3 b 1 25
4 c 1 581
5 d 1 12
You could use:
library(dplyr)
df1 %>%
mutate(
value = if_else(count == "1" & KEY1 %in% df2$KEY2,
tibble::deframe(df2)[KEY1],
NA_character_)
)
which returns
KEY1 count value
1 a 2 <NA>
2 a 2 <NA>
3 b 1 25
4 c 1 581
5 d 1 12
Or the same as base R:
transform(
df1,
value = ifelse(df1$count == 1,
`names<-`(df2$value, df2$KEY2)[df1$KEY1],
NA_character_)
)
Using data.table
library(data.table)
# Update join: add df2's value to the matching rows of df1, set to NA wherever
# the key is not unique (count != 1). replace() works whether the columns are
# numeric or character, so the example data can be used as is.
setDT(df1)[df2, value := replace(i.value, count != 1, NA), on = .(KEY1 = KEY2)]
-output
> df1
KEY1 count value
1: a 2 NA
2: a 2 NA
3: b 1 25
4: c 1 581
5: d 1 12
NOTE: The numeric columns were created as character. Assuming they are of class numeric, join on the KEY columns and assign the value to 'df1' after converting it to NA based on the 'count' column values.

Updating old dataframe with new dataframe in R

I am working to update an old dataframe with a data from a new dataframe.
I found this option, it works for some of the fields, but not all. Not sure how to alter that as it is beyond my skill set. I tried removing the is.na(x) portion of the ifelse code and that did not work.
df_old <- data.frame(
bb = as.character(c("A", "A", "A", "B", "B", "B")),
y = as.character(c("i", "ii", "ii", "i", "iii", "i")),
z = 1:6,
aa = c(NA, NA, 123, NA, NA, 12))
df_new <- data.frame(
bb = as.character(c("A", "A", "A", "B", "A", "A")),
z = 1:6,
aa = c(NA, NA, 123, 1234, NA, 12))
cols <- names(df_new)[names(df_new) != "z"]
df_old[,cols] <- mapply(function(x, y) ifelse(is.na(x), y[df_new$z == df_old$z], x), df_old[,cols], df_new[,cols])
The code also changes my bb variable from a character vector to a numeric. Do I need another call to mapply focusing on specific variable bb?
To update the aa and bb columns you can approach this using a join via merge(). This assumes column z is the index for these data frames.
# Inner-join old and new on the shared index column `z`; the overlapping
# columns come back suffixed `.x` (from df_old) and `.y` (from df_new).
df_final <- merge(df_old, df_new, by = "z")
# `aa`: keep the old value, fall back to the new one only where old is NA
df_final$aa <- with(df_final, ifelse(is.na(aa.x), aa.y, aa.x))
# `bb`: always take the value from `df_new`
df_final$bb <- df_final[["bb.y"]]
# keep only the final columns, in the original layout
df_final <- df_final[, c("bb", "z", "y", "aa")]
df_final
bb z y aa
1 A 1 i NA
2 A 2 ii NA
3 A 3 ii 123
4 B 4 i 1234
5 A 5 iii NA
6 A 6 i 12

Count occurrences per entry in dataframe

I have the following kind of dataframe (this is simplified example):
id = c("1", "1", "1", "2", "3", "3", "4", "4")
bank = c("a", "b", "c", "b", "b", "c", "a", "c")
df = data.frame(id, bank)
df
id bank
1 1 a
2 1 b
3 1 c
4 2 b
5 3 b
6 3 c
7 4 a
8 4 c
In this dataframe you can see that for some ids there are multiple banks, i.e. for id==1, bank=c(a,b,c).
The information I would like to extract from this dataframe is the overlap between id's within different banks and the count.
So for example for bank a: bank a has two persons (unique ids): 1 and 4. For these persons, I want to know what other banks they have
For person 1: bank b and c
For person 4: bank c
the total amount of other banks: 3, for which, b = 1, and c = 2.
So I want to create as output a sort of overlap table as below:
bank overlap amount
a b 1
a c 2
b a 1
b c 2
c a 2
c b 2
Took me a while to get a result, so I'll post it. Not as sexy as Ronak Shah's, but same result.
library(data.table)  # rbindlist()
library(dplyr)       # %>%, group_by(), summarise()

id <- c("1", "1", "1", "2", "3", "3", "4", "4")
bank <- c("a", "b", "c", "b", "b", "c", "a", "c")
df <- data.frame(id, bank)
df$bank <- as.character(df$bank)

# One data frame per id, holding every pair of banks that id uses.
dflist <- split(df, df$id)
resultlist <- vector("list", length(dflist))
for (i in seq_along(dflist)) {
  banks <- dflist[[i]]$bank
  if (length(banks) < 2) {
    # A single bank cannot overlap with anything: contribute zero rows, with
    # the same column names and types as the pair tables built below.
    resultlist[[i]] <- data.frame(V1 = character(0), V2 = character(0))
  } else {
    resultlist[[i]] <- as.data.frame(t(combn(banks, 2)))
  }
}

# Stack the per-id pair tables and count how often each pair occurs.
result <- setNames(rbindlist(resultlist), c("bank", "overlap"))
result %>%
  group_by(bank, overlap) %>%
  summarise(amount = n())
bank overlap amount
<fct> <fct> <int>
1 a b 1
2 a c 2
3 b c 2
We may use data.table:
df = data.frame(id = c("1", "1", "1", "2", "3", "3", "4", "4"),
bank = c("a", "b", "c", "b", "b", "c", "a", "c"))
library(data.table)
# Within each id, pair every bank with each bank listed after it:
#   `bank`    repeats the 1st bank .N-1 times, the 2nd .N-2 times, ..., the last 0 times
#   `overlap` picks, for each of those repeats, the banks at the following positions
# and then count how often each (bank, overlap) pair occurs across ids.
# NOTE(review): an id with a single bank (.N == 1) yields a row with bank = NA
# (see the last row of the output); drop it with na.omit() if unwanted.
setDT(df)[, .(bank = rep(bank, (.N-1L):0L),
overlap = bank[(sequence((.N-1L):1L) + rep(1:(.N-1L), (.N-1L):1))]),
by=id][,
.N, by=.(bank, overlap)]
#> bank overlap N
#> 1: a b 1
#> 2: a c 2
#> 3: b c 2
#> 4: <NA> b 1
Created on 2019-07-01 by the reprex package (v0.3.0)
Please note that you have b for id==2 which is not overlapping with other values. If you don't want that in the final product, just apply na.omit() on the output.
An option would be full_join
library(dplyr)
# Self-join on id to pair every bank of an id with every bank of that same id,
# drop the pairs of a bank with itself, then count each ordered pair.
df %>%
  full_join(df, by = "id") %>%
  filter(bank.x != bank.y) %>%
  dplyr::count(bank.x, bank.y) %>%
  rename(bank = bank.x, overlap = bank.y, amount = n)
# A tibble: 6 x 3
# bank overlap amount
# <fct> <fct> <int>
#1 a b 1
#2 a c 2
#3 b a 1
#4 b c 2
#5 c a 2
#6 c b 2
Do you need to cover both banks in both the directions? Since a -> b is same as b -> a in this case here. We can use combn and create combinations of unique bank taken 2 at a time, find out length of common id found in the combination.
# For every unordered pair of distinct banks, count the ids that appear under
# both banks; each combn() result column is (bank 1, bank 2, count), and the
# transpose turns that into one row per pair.
as.data.frame(t(combn(unique(df$bank), 2, function(x)
c(x, with(df, length(intersect(id[bank == x[1]], id[bank == x[2]])))))))
# V1 V2 V3
#1 a b 1
#2 a c 2
#3 b c 2
data
id = c("1", "1", "1", "2", "3", "3", "4", "4")
bank = c("a", "b", "c", "b", "b", "c", "a", "c")
df = data.frame(id, bank, stringsAsFactors = FALSE)

R: Replace column depending on match of two other columns

Lets assume there are 2 columns of two huge dataframes (different lengths) like:
df1 df2
A 1 C X
A 1 D X
B 4 C X
A 1 F X
B 4 A X
B 4 B X
C 7 B X
Each time there is a match in the 1st columns, X should be replaced with data from column 2 of df1. If the 1st column of df2 contains elements that are not in the first column of df1 (F, D), X should be replaced with 0.
Since the dataframes are huge, a loop within a loop would not be practical.
The solution should look like this:
df1 df2
A 1 C 7
A 1 D 0
B 4 C 7
A 1 F 0
B 4 A 1
B 4 B 4
C 7 B 4
Thank You in advance
As there are duplicate rows in 'df1', we can get the unique rows
df3 <- unique(df1)
Then, use match to get the index
i1 <- match(df2$Col1, df3$Col1)
and based on the index, assign
df2$Col2 <- df3$Col2[i1]
If there are no matches, it would be NA, which can be changed to 0
df2$Col2[is.na(df2$Col2)] <- 0
df2
# Col1 Col2
#1 C 7
#2 D 0
#3 C 7
#4 F 0
#5 A 1
#6 B 4
#7 B 4
Or this can be done with data.table by joining on the 'Col1' and assigning the 'Col2' (after removing the Col2 from the second data) with the Col2 from 'df3'
library(data.table)
# Drop the placeholder column, pull Col2 from df3 for every matching Col1,
# then set the keys that have no match in df3 (left as NA by the join) to 0.
setDT(df2)[, Col2 := NULL][df3, Col2 := i.Col2, on = .(Col1)][
  is.na(Col2), Col2 := 0]
data
df1 <- structure(list(Col1 = c("A", "A", "B", "A", "B", "B", "C"), Col2 = c(1,
1, 4, 1, 4, 4, 7)), class = "data.frame", row.names = c(NA, -7L
))
df2 <- structure(list(Col1 = c("C", "D", "C", "F", "A", "B", "B"), Col2 = c("X",
"X", "X", "X", "X", "X", "X")), class = "data.frame", row.names = c(NA,
-7L))

replacing multiple values in data frame in R

I want to reassign multiple different character strings with the same value in a single call. However, the following code only replaces some of the values in each variable.
dat <-data.frame(x=c(rep("1=x",4),rep("b",4)),y=c(rep("1=z",4),rep("b",4)))
dat[] <- sapply(dat[], as.character)
dat[dat == c("1=x", "1=y")]<- 1
such that I get:
dat
x y
1 1 1=z
2 1=x 1=z
3 1 1=z
4 1=x 1=z
5 b b
6 b b
7 b b
8 b b
when what I want is the following:
dat
x y
1 1 1
2 1 1
3 1 1
4 1 1
5 b b
6 b b
7 b b
8 b b
With dplyr:
library(dplyr)
# Replace every value containing "1=" with 1, in all columns. across()
# replaces the deprecated funs() / superseded mutate_all() combination.
dat <- mutate(dat, across(everything(), ~ replace(.x, grepl("1=", .x), 1)))
With Base R:
# In every column, overwrite the entries that contain "1=" with 1.
dat[] <- lapply(dat, function(column) {
  column[grepl("1=", column)] <- 1
  column
})
Result:
x y
1 1 1
2 1 1
3 1 1
4 1 1
5 b b
6 b b
7 b b
8 b b
Data:
# Example input: four "1=..." codes followed by four "b"s in each column.
dat <- data.frame(
  x = rep(c("1=x", "b"), each = 4),
  y = rep(c("1=z", "b"), each = 4),
  stringsAsFactors = FALSE
)
Another Base R option if you want to make an explicit replacement of certain strings would be:
dat[] <- lapply(dat,function(x) ifelse(x %in% c("1=x", "1=z"), 1, x))
Result:
x y
1 1 1
2 1 1
3 1 1
4 1 1
5 b b
6 b b
7 b b
8 b b
Data:
dat <- structure(list(x = c("1", "1", "1", "1", "b", "b", "b", "b"),
y = c("1", "1", "1", "1", "b", "b", "b", "b")), row.names = c(NA,
-8L), class = "data.frame")

Resources