If string exists, replace cell with NA - r

I have a data frame as follows:
Col1 Col2 Col3 Col4 Col5
U N=>A {N A} NA
V {L E=>e E e}
X M=>P {M P} NA
Y {Z Q=>p Q p}
How do I do the following?
Replace any cells that contain => with NA.
Remove { and } from the data frame.
The final output should look like this:
Col1 Col2 Col3 Col4 Col5
U NA N A NA
V L NA E e
X NA M P NA
Y Z NA Q p

We can loop over the columns, use grepl to find the elements that have =>, replace it with NA and then replace the additional non-alphabetic characters with gsub
# Set every cell containing "=>" to NA, then strip "{" and "}" from what is left.
df1[] <- lapply(df1, function(column) {
  column[grepl("=>", column)] <- NA
  gsub("[{}]+", "", column)
})
df1
# Col1 Col2 Col3 Col4 Col5
#1 U <NA> N A <NA>
#2 V L <NA> E e
#3 X <NA> M P <NA>
#4 Y Z <NA> Q p
data
df1 <- structure(list(Col1 = c("U", "V", "X", "Y"), Col2 = c("N=>A",
"{L", "M=>P", "{Z"), Col3 = c("{N", "E=>e", "{M", "Q=>p"), Col4 = c("A}",
"E", "P}", "Q"), Col5 = c(NA, "e}", NA, "p}")), .Names = c("Col1",
"Col2", "Col3", "Col4", "Col5"), class = "data.frame", row.names = c(NA,
-4L))

Related

Replace missing value of all the following columns based on the value of preceding column

I have a dataframe like below:
SampleId Col1 Col2 Col3 Col4
1 st1 k p
2 st2 k
3 st3 k p g
4 st4 k p g s
I want the empty rows in the columns to be filled based on the available values from preceding columns, so something like below:
SampleId Col1 Col2 Col3 Col4
1 st1 k p p p
2 st2 k k k k
3 st3 k p g g
4 st4 k p g s
What would be a dplyr way to do this?
You could transform the data to long, replace "" with NA, and fill in missing values with the previous value with fill(). Finally, transform the data back to wide.
library(dplyr)
library(tidyr)
# Reshape to long form, turn "" into NA, carry the last non-missing value
# forward within each sample, then reshape back to wide form.
df %>%
  pivot_longer(starts_with("Col")) %>%
  mutate(value = na_if(value, "")) %>%
  # Group first so that a sample whose first column is empty never inherits
  # the last value of the previous sample.
  group_by(SampleId) %>%
  fill(value) %>%
  ungroup() %>%
  pivot_wider()
# # A tibble: 4 × 5
# SampleId Col1 Col2 Col3 Col4
# <chr> <chr> <chr> <chr> <chr>
# 1 st1 k p p p
# 2 st2 k k k k
# 3 st3 k p g g
# 4 st4 k p g s
Data
df <- structure(list(SampleId = c("st1", "st2", "st3", "st4"),
Col1 = c("k", "k", "k", "k"), Col2 = c("p", "", "p", "p"), Col3 = c("", "", "g", "g"),
Col4 = c("", "", "", "s")), class = "data.frame", row.names = c("1", "2", "3", "4"))

Ifelse Statement to Replace Value in Corresponding Row

I have a dataframe like below:
Col1 Col2 COl4 Col5
A B NA NA
M L NA lo
A N NA KE
How do I make the logic where, if Col1 = A, replace NA in COl4 with "Pass"?
When I try using ifelse, I do not get the expected output.
Expected output should be:
Col1 Col2 COl4 Col5
A B Pass NA
M L NA lo
A N Pass KE
I tried this but no luck:
df$COl4<-
ifelse(df$Col1=="A", "Pass", df$COl4)
No real need for ifelse() here. You can use standard index replacement.
# Assign "Pass" to COl4 in every row where Col1 equals "A".
# NOTE(review): this overwrites COl4 in those rows even when it is not NA; add
# `& is.na(df$COl4)` to the index if only missing values should be replaced.
df$COl4[df$Col1 == "A"] <- "Pass"
This says that we are replacing COl4 such that Col1 == "A" with "Pass". Additionally, this method will not mess with attributes like ifelse() will.
You can use case_when:
library(tidyverse)
# Example table; COl4 starts out entirely NA.
tab <- tibble(
  Col1 = c("A", "M", "A"),
  Col2 = c("B", "L", "N"),
  COl4 = c(NA, NA, NA),
  Col5 = c(NA, "lo", "KE")
)
# Rows with Col1 == "A" get "Pass"; every other row keeps COl4 (as character).
mutate(
  tab,
  COl4 = case_when(
    Col1 == "A" ~ "Pass",
    TRUE ~ as.character(COl4)
  )
)
# A tibble: 3 x 4
Col1 Col2 COl4 Col5
<chr> <chr> <chr> <chr>
1 A B Pass NA
2 M L NA lo
3 A N Pass KE
The benefit of using case_when shows when you have many conditions.
The TRUE branch covers the remaining rows of COl4 that match none of the conditions.

R grep search patterns in multiple columns

I have a data frame like as follows:
Col1 Col2 Col3
A B C
D E F
G H I
I am trying to keep lines matching 'B' in 'Col2' OR F in 'Col3', in order to get:
Col1 Col2 Col3
A B C
D E F
I tried:
data[(grep("B",data$Col2) || grep("F",data$Col3)), ]
but it returns the entire data frame.
NOTE: it works when calling the 2 grep one at a time.
Or using a single grepl after pasting the columns
# Paste Col2 and Col3 together and keep the rows whose pasted text contains
# "B" or "F".
# NOTE(review): the pattern is tested against both columns at once, so a "B" in
# Col3 or an "F" in Col2 would also keep the row.
df1[with(df1, grepl("B|F", paste(Col2, Col3))),]
# Col1 Col2 Col3
#1 A B C
#2 D E F
# Keep rows where Col2 is exactly "B" or Col3 is exactly "F".
df1[df1$Col2 == "B" | df1$Col3 == "F", ]
# Col1 Col2 Col3
# 1 A B C
# 2 D E F
Using grepl
# Keep rows where Col2 contains "B" or Col3 contains "F".
df1[grepl("B", df1$Col2) | grepl("F", df1$Col3), ]
# Col1 Col2 Col3
# 1 A B C
# 2 D E F
Data:
df1 <- structure(list(Col1 = c("A", "D", "G"), Col2 = c("B", "E", "H"
), Col3 = c("C", "F", "I")), .Names = c("Col1", "Col2", "Col3"
), row.names = c(NA, -3L), class = "data.frame")
The data.table package makes this type of operation trivial due to its compact and readable syntax. Here is how you would perform the above using data.table:
> df1 <- structure(list(Col1 = c("A", "D", "G"), Col2 = c("B", "E", "H"
+ ), Col3 = c("C", "F", "I")), .Names = c("Col1", "Col2", "Col3"
+ ), row.names = c(NA, -3L), class = "data.frame")
> library(data.table)
> DT <- data.table(df1)
> DT
Col1 Col2 Col3
1: A B C
2: D E F
3: G H I
> DT[Col2 == 'B' | Col3 == 'F']
Col1 Col2 Col3
1: A B C
2: D E F
>
data.table performs its matching operations with with=TRUE by default. Note that the matching is much faster if you set keys on the data but that is for another topic.

Determine if Value in Final Column exists in respective rows

I have a dataframe as follows:
df1
ColA ColB ColC ColD ColE COlF ColG Recs
1 A-1 A - 3 B B NA C
1 B-1 C R D E NA B
1 NA A B A B
How do I determine whether the value in the last column, Recs, is found in its respective row?
I tried below but it doesn't work because there are duplicates in my normal dataset:
df1$Exist <- apply(df1, 1, FUN = function(x)
c("No", "Yes")[(anyDuplicated(x[!is.na(x) & x != "" ])!=0) +1])
There are also blanks, NA's, and character values that have spaces and dashes.
Final output should be:
ColA ColB ColC ColD ColE COlF ColG Recs Exist?
1 A-1 A - 3 B B NA C No
1 B-1 C R D E NA B No
1 NA A B A B Yes
Thanks
For efficiency, you could use data.table here.
library(data.table)
# Row by row, test whether Recs exactly matches any value in the other columns.
# seq_len() keeps the per-row grouping valid for a zero-row table, where
# 1:nrow(df) would yield c(1, 0).
setDT(df)[, Exist := Recs %chin% unlist(.SD), .SDcols = -"Recs",
          by = seq_len(nrow(df))]
which gives
ColA ColB ColC ColD ColE COlF ColG Recs Exist
1: 1 A-1 A-3 B B NA NA C FALSE
2: 1 B-1 C R D E NA B FALSE
3: 1 NA A B A NA B TRUE
Original data:
df <-structure(list(ColA = c(1L, 1L, 1L), ColB = c("A-1", "", NA),
ColC = c("A-3", "B-1", "A"), ColD = c("B", "C R", "B"), ColE = c("B",
"D", "A"), COlF = c(NA, "E", ""), ColG = c(NA, NA, NA), Recs = c("C",
"B", "B")), .Names = c("ColA", "ColB", "ColC", "ColD", "ColE",
"COlF", "ColG", "Recs"), row.names = c(NA, -3L), class = "data.frame")
If I understood you correctly, this should work:
# Compute column index of reference variable
# Locate the reference column; exactly one column must be named "Recs".
col_ind <- which(colnames(df1) == "Recs")
stopifnot(length(col_ind) == 1L)
# Compare column by column rather than with apply(df1, 1, ...): apply() first
# coerces the data frame with as.matrix(), which formats non-character columns
# and can pad numbers with spaces so that equal values no longer compare equal.
target <- as.character(df1[[col_ind]])
matches <- lapply(df1[-col_ind], function(column) {
  column <- as.character(column)
  # NA on either side counts as "no match".
  !is.na(column) & !is.na(target) & column == target
})
# TRUE for rows where at least one other column equals the Recs value.
present_bool <- Reduce(`|`, matches, rep(FALSE, nrow(df1)))
# Create the desired column
df1$Exist <- ifelse(present_bool, "Yes", "No")
# For each row, record whether its Recs value equals any value in the columns
# that precede Recs (columns 1-7 in the example data).
recs_pos <- match("Recs", names(df1))
exist <- vapply(seq_len(nrow(df1)), function(i) {
  # Flatten the row to a plain character vector so that %in% compares cell
  # values rather than a one-row data frame.
  candidates <- vapply(df1[i, seq_len(recs_pos - 1L), drop = FALSE],
                       as.character, character(1))
  target <- as.character(df1$Recs[i])
  # A missing Recs value never counts as found.
  !is.na(target) && target %in% candidates
}, logical(1))
df1 <- cbind(df1, exist)
This should be another way of obtaining the desired result:
# Check whether the value in column 8 of row `x` of the global data frame `df`
# equals any of the values in columns 1-7 of that row.
#
# x: a single row index into `df`.
# Returns a logical vector with one element per compared column (7 values);
# TRUE marks an exact match. NA on either side is never a match.
f.checkExist <- function(x) {
  target <- as.character(df[x, 8])
  candidates <- vapply(df[x, 1:7], as.character, character(1))
  # Exact comparison instead of grepl(): a regex search treats the value as a
  # pattern, so "B" would also "match" a cell such as "B-1".
  unname(!is.na(candidates) & !is.na(target) & candidates == target)
}
# A row "exists" when f.checkExist() reports TRUE for at least one column.
# vapply() over seq_len() returns a plain logical vector (empty for a zero-row
# table) instead of searching the deparsed lapply() result for the text "TRUE".
df$exists <- vapply(seq_len(nrow(df)), function(i) {
  any(f.checkExist(i), na.rm = TRUE)
}, logical(1))

R paste0 2 columns if not NA

I would like to paste0 two columns if the element in one column is not NA. If an element of one column is NA, then keep only the element of the other column.
structure(list(col1 = structure(1:3, .Label = c("A", "B", "C"),
class = "factor"), col2 = c(1, NA, 3)), .Names = c("col1", "col2"),
class = "data.frame",row.names = c(NA, -3L))
# col1 col2
# 1 A 1
# 2 B NA
# 3 C 3
structure(list(col1 = structure(1:3, .Label = c("A", "B", "C"),
class = "factor"),col2 = c(1, NA, 3), col3 = c("A|1", "B", "C|3")),
.Names = c("col1", "col2", "col3"), row.names = c(NA,-3L),
class = "data.frame")
# col1 col2 col3
#1 A 1 A|1
#2 B NA B
#3 C 3 C|3
you can also do it with regular expressions:
# Paste the columns, then drop the "NA" placeholder of a missing side. The
# anchors (^ and $) limit the match to a whole missing value, so real values
# that merely end or start with "NA" (e.g. "DNA|1") are left intact.
df$col3 <- sub("^NA\\||\\|NA$", "", with(df, paste0(col1, "|", col2)))
That is, paste them in regular way and then replace any "NA|" or "|NA" with "". Note that | needs to be "double escaped" because it means "OR" in regexps, that's why the strange pattern NA\\||\\|NA means actually "NA|" OR "|NA".
As @Roland says, this is easy using ifelse (just translate the mental logic into a series of nested ifelse statements):
# Build col3: when one side is NA use the other side alone, otherwise join
# both with "|".
x <- transform(
  x,
  col3 = {
    left <- as.character(col1)
    right <- as.character(col2)
    joined <- paste0(col1, "|", col2)
    ifelse(is.na(col1), right, ifelse(is.na(col2), left, joined))
  }
)
update: need as.character in some cases.
Try:
> df$col1 = as.character(df$col1)
> df$col3 = with(df, ifelse(is.na(col1),col2, ifelse(is.na(col2), col1, paste0(col1,'|',col2))))
> df
col1 col2 col3
1 A 1 A|1
2 B NA B
3 C 3 C|3
You could also do:
library(stringr)
# Row by row: drop the NA entries, trim surrounding whitespace from the
# remaining values, and join them with "|".
df$col3 <- apply(df, 1, function(row_values) {
  kept <- row_values[!is.na(row_values)]
  paste(str_trim(kept), collapse = "|")
})
df
# col1 col2 col3
#1 A 1 A|1
#2 B NA B
#3 C 3 C|3

Resources