I have a dataset like below:
Col1 Col2 Col3
abckel NA 7
jdmelw njabc NA
8 jdken jdne
How do I subset my dataset so that it only keeps rows that contain the string "abc"?
Final Expected Output:
Col1 Col2 Col3
abckel NA 7
jdmelw njabc NA
With your data.frame:
d <- data.frame("Col1" = c("abckel", "jdmelw", 8),
"Col2" = c(NA, "njabc", NA),
"Col3" = c(7, NA, "jdne"),
stringsAsFactors = F)
The following should return your desired result:
d_new <- d[apply(d, 1, function(x) any(grepl("abc", x))), ]
A dplyr solution:
library(dplyr)
df %>% filter_all(any_vars(grepl("abc", .)))
Output:
Col1 Col2 Col3
1: abckel <NA> 7
2: jdmelw njabc <NA>
Related
I have a dataframe such as :
COL1 COL2 COL3 COL4 COL5 COL6 COL7
1 Sp1-2 Sp1-2 Sp3_2-54 Sp3-2 Sp3-2 Sp3-2 SP9-43
2 Sp5-1 Sp5-2 Sp2-4 Sp9-2 Sp10-3 SP9-90 NA
3 Sp_7-3 Sp_7-3 NA SP6-56 Sp2-7 SP3-3 NA
And I would simply like to merge columns when at leats two elements are duplicated.
for example, in COL1 and COL2, Sp1-2 & Sp_7-3 are duplicated in both columns, then I merge it that way by adding a pipe "|" between non-duplicated elements:
COL1|COL2 COL3 COL4|COL5|COL6 COL7
1 Sp1-2 Sp3_2-54 Sp3-2 SP9-43
2 Sp5-1|Sp5-2 Sp2-4 Sp9-2|Sp10-3|SP9-90 NA
3 Sp_7-3 NA SP6-56|Sp2-7|SP3-3 NA
Here is the dput format :
structure(list(COL1 = c("Sp1-2", "Sp5-1", "Sp_7-3"), COL2 = c("Sp1-2",
"Sp5-2", "Sp_7-3"), COL3 = c("Sp3_2-54", "Sp2-4", NA), COL4 = c("Sp3-2",
"Sp9-2", "SP6-56"), COL5 = c("Sp3-2", "Sp10-3", "Sp2-7"), COL6 = c("Sp3-2",
"SP9-90", "SP3-3"), COL7 = c("SP9-43", NA, NA)), class = "data.frame", row.names = c(NA,
-3L))
Another example :
G136 G348 G465
1 NA NA NA
2 NA NA NA
3 SP4-140 SP4-140 NA
4 SP2-8 NA NA
5 SP3-59 NA NA
6 SP1_contig.682-8 NA SP1_contig.682-8
expected output:
G136|G348|G465
1 NA
2 NA
3 SP4-140
4 SP2-8
5 SP3-59
6 SP1_contig.682-8
the deput format :
dat<- structure(list(G136 = c(NA, NA, "SP4-140", "SP2-8", "SP3-59", "SP1_contig.682-8", NA, NA, NA), G348 = c(NA, NA, "SP4-140", NA, NA, NA, NA, NA, NA), G465 = c(NA, NA, NA, NA, NA, "SP1_contig.682-8", NA, NA, NA)), row.names = c(NA, -9L), class = c("tbl_df", "tbl", "data.frame"))
This is probably best handled by reshaping your data first, then it's straight forward to use various groupings to achieve your desired result:
library(tidyr)
library(dplyr)
dat %>%
rowid_to_column() %>%
pivot_longer(-rowid) %>%
filter(!is.na(value)) %>%
group_by(rowid, value) %>%
mutate(new_name = paste(name, collapse = "|")) %>%
separate_rows(new_name, sep = "\\|") %>%
group_by(name) %>%
mutate(new_name = paste(unique(new_name), collapse = "|")) %>%
group_by(value) %>%
filter(nchar(new_name) == max(nchar(new_name))) %>%
ungroup() %>%
select(-name) %>%
pivot_wider(names_from = new_name, values_from = value, values_fn = ~ paste(unique(.x), collapse = "|")) %>%
complete(rowid = full_seq(c(1, rowid), 1))
# A tibble: 3 × 5
rowid `COL1|COL2` COL3 `COL4|COL5|COL6` COL7
<dbl> <chr> <chr> <chr> <chr>
1 1 Sp1-2 Sp3_2-54 Sp3-2 SP9-43
2 2 Sp5-1|Sp5-2 Sp2-4 Sp9-2|Sp10-3|SP9-90 NA
3 3 Sp_7-3 NA SP6-56|Sp2-7|SP3-3 NA
And using the data in your second example gives:
# A tibble: 6 × 2
rowid `G136|G348|G465`
<dbl> <chr>
1 1 NA
2 2 NA
3 3 SP4-140
4 4 SP2-8
5 5 SP3-59
6 6 SP1_contig.682-8
It's really messy...but you may try
library(igraph)
library(stringdist)
library(data.table)
table(df[1,])
d <- c()
for (i in 1:(ncol(df)-1)){
for (j in (i+1):ncol(df)) {
if(any(na.omit(stringdist(df[,i], df[,j], method = "lv") == 0))) {
d <- rbind(d, c(i,j))
}
}
}
dd <- data.table(d)
net <- graph_from_data_frame(d = dd, directed = F)
key <- split(names(V(net)), components(net)$membership)
res <- matrix(NA,nrow = nrow(df), ncol = 0)
names_dummy <- c()
df_dummy <- c()
for (i in key){
i <- as.numeric(i)
names_dummy <- c(names_dummy, paste0(colnames(df)[i], collapse = "|"))
df_dummy <- cbind(df_dummy, apply(df[,i], 1, function(x) {paste0(unique(unlist(x)), collapse = "|")}))
}
colnames(df_dummy) <- names_dummy
df_dummy
res <- cbind(df_dummy, df[,-as.numeric(unlist(key))])
res <- res[,sort(colnames(res))]
res
COL1|COL2 COL3 COL4|COL5|COL6 COL7
1 Sp1-2 Sp3_2-54 Sp3-2 SP9-43
2 Sp5-1|Sp5-2 Sp2-4 Sp9-2|Sp10-3|SP9-90 <NA>
3 Sp_7-3 <NA> SP6-56|Sp2-7|SP3-3 <NA>
Hello coding community,
If my data frame looks like:
ID Col1 Col2 Col3 Col4
Per1 1 2 3 4
Per2 2 NA NA NA
Per3 NA NA 5 NA
Is there any syntax to delete the row associated with ID = Per2, on the basis that Col2, Col3, AND Col4 = NA? I am hoping for code that will allow me to delete a row on the basis that three specific columns (Col2, Col3, and Col4) ALL are NA. This code would NOT delete the row ID = Per3, even though there are three NAs.
Please note that I know how to delete a specific row, but my data frame is big so I do not want to manually sort through all rows/columns.
Big thanks!
Test for NA and delete rows with a number of NA's equal to the number of columns tested using rowSums.
dat[!rowSums(is.na(dat[c('Col2', 'Col3', 'Col4')])) == 3, ]
# ID Col1 Col2 Col3 Col4
# 1 Per1 1 2 3 4
# 3 Per3 NA NA 5 NA
You can use if_all
library(dplyr)
filter(df, !if_all(c(Col2, Col3, Col4), ~ is.na(.)))
# ID Col1 Col2 Col3 Col4
# 1 Per1 1 2 3 4
# 2 Per3 NA NA 5 NA
data
df <- structure(list(ID = c("Per1", "Per2", "Per3"), Col1 = c(1L, 2L,
NA), Col2 = c(2L, NA, NA), Col3 = c(3L, NA, 5L), Col4 = c(4L,
NA, NA)), class = "data.frame", row.names = c(NA, -3L))
Using if_any
library(dplyr)
df %>%
filter(if_any(Col2:Col4, complete.cases))
ID Col1 Col2 Col3 Col4
1 Per1 1 2 3 4
2 Per3 NA NA 5 NA
I have two columns
COL1 COL2
SCS NA
NA NA
NA PB
NA RM
Whenever col1 is na and col2 has a value, I want col2's value to overwrite the na.
Whenever col1 has a value, I want it to stay that value no matter what is in col2.
This could be done with coalesce by specifying COL1 as first argument followed by COL2 so that if there is any NA in COL1 for a particular row, it will be replaced by the corresponding row from COL2
library(dplyr)
data <- data %>%
mutate(COL1 = coalesce(COL1, COL2))
-output
data
COL1 COL2
1 SCS <NA>
2 <NA> <NA>
3 PB PB
4 RM RM
data
data <- structure(list(COL1 = c("SCS", NA, NA, NA), COL2 = c(NA, NA,
"PB", "RM")), class = "data.frame", row.names = c(NA, -4L))
With the data.table package and assuming your data.frame is named df:
library(data.table)
setDT(df)
df[is.na(col1), col1 := col2]
This will replace values in col1 with values in col2 if a col1 value is `na
data <- data%>%
mutate(COL1 = case_when(is.na(COL1) ~ COL2,
TRUE ~ as.character(COL1)))
COL1 COL2
SCS NA
NA NA
PB PB
RM RM
I have a dataframe like below:
Col1 Col2 COl4 Col5
A B NA NA
M L NA lo
A N NA KE
How do I make the logic where, if Col1 = A, replace NA in COl4 with "Pass"?
When I try using ifelse, I do not get the expected output.
Expected output should be:
Col1 Col2 COl4 Col5
A B Pass NA
M L NA lo
A N Pass KE
I tried this but no luck:
df$COl4<-
ifelse(df$Col1=="A", "Pass", df$COl4)
No real need for ifelse() here. You can use standard index replacement.
df$COl4[df$Col1 == "A"] <- "Pass"
This says that we are replacing COl4 such that Col1 == "A" with "Pass". Additionally, this method will not mess with attributes like ifelse() will.
You can use case_when:
library(tidyverse)
tab <- tibble(Col1 = c("A", "M", "A"), Col2 = c("B", "L", "N"), COl4 = c(NA, NA, NA), Col5 = c(NA, "lo", "KE"))
tab %>%
mutate(COl4 = case_when(
Col1 == "A" ~ "Pass",
TRUE ~ as.character(COl4))
)
# A tibble: 3 x 4
Col1 Col2 COl4 Col5
<chr> <chr> <chr> <chr>
1 A B Pass NA
2 M L NA lo
3 A N Pass KE
The benefit of use case_when is when you have too many conditions!
The TRUE is for the rest of the COl4 that don't need any condition).
I would like to paste0 two columns if the element in one column is not NA.If one element of one columns is NA then keep the element of the other column only.
structure(list(col1 = structure(1:3, .Label = c("A", "B", "C"),
class = "factor"), col2 = c(1, NA, 3)), .Names = c("col1", "col2"),
class = "data.frame",row.names = c(NA, -3L))
# col1 col2
# 1 A 1
# 2 B NA
# 3 C 3
structure(list(col1 = structure(1:3, .Label = c("A", "B", "C"),
class = "factor"),col2 = c(1, NA, 3), col3 = c("A|1", "B", "C|3")),
.Names = c("col1", "col2", "col3"), row.names = c(NA,-3L),
class = "data.frame")
# col1 col2 col3
#1 A 1 A|1
#2 B NA B
#3 C 3 C|3
you can also do it with regular expressions:
df$col3 <- sub("NA\\||\\|NA", "", with(df, paste0(col1, "|", col2)))
That is, paste them in regular way and then replace any "NA|" or "|NA" with "". Note that | needs to be "double escaped" because it means "OR" in regexps, that's why the strange pattern NA\\||\\|NA means actually "NA|" OR "|NA".
As #Roland says, this is easy using ifelse (just translate the mental logic into a series of nested ifelse statements):
x <- transform(x,col3=ifelse(is.na(col1),as.character(col2),
ifelse(is.na(col2),as.character(col1),
paste0(col1,"|",col2))))
update: need as.character in some cases.
Try:
> df$col1 = as.character(df$col1)
> df$col3 = with(df, ifelse(is.na(col1),col2, ifelse(is.na(col2), col1, paste0(col1,'|',col2))))
> df
col1 col2 col3
1 A 1 A|1
2 B NA B
3 C 3 C|3
You could also do:
library(stringr)
df$col3 <- apply(df, 1, function(x)
paste(str_trim(x[!is.na(x)]), collapse="|"))
df
# col1 col2 col3
#1 A 1 A|1
#2 B NA B
#3 C 3 C|3