If my data looks like this:
  q2_3  q2_4  q2_5
  <chr> <chr> <chr>
1 1A    2B    3C
2 4D    5E    6F
How can I delete only the text? I want only the numbers to remain.
You could use \\D to remove all the characters that are not digits.
Using dplyr
library(dplyr)
df %>% mutate_all(~gsub('\\D', '', .))
# q2_3 q2_4 q2_5
#1 1 2 3
#2 4 5 6
Or in base R:
df[] <- lapply(df, function(x) gsub('\\D', '', x))
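Note that gsub() leaves the columns as character ("1", "2", ...); if you need numeric columns afterwards, a small follow-up sketch (not part of the original answer) using base type.convert():
# convert the cleaned character columns to integer/numeric
df <- type.convert(df, as.is = TRUE)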
data
df <- structure(list(q2_3 = c("1A", "4D"), q2_4 = c("2B", "5E"), q2_5 = c("3C",
"6F")), class = "data.frame", row.names = c("1", "2"))
You can also use parse_number() from the readr package (which extracts the first number from each value):
library(readr)
data <- data.frame(q2_3 = c("1A", "4D"),
                   q2_4 = c("2B", "5E"),
                   q2_5 = c("3C", "6F"))
data[] <- lapply(data, parse_number)
results in
> print(data)
q2_3 q2_4 q2_5
1 1 2 3
2 4 5 6
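Note that parse_number() and the gsub() approaches only agree when each value contains a single run of digits; with a value like "A1B2" (hypothetical, not in the question's data) they differ:
parse_number("A1B2")     # 1    -- only the first number is extracted, as numeric
gsub("\\D", "", "A1B2")  # "12" -- all digit characters are kept, as character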
Another option with mutate/across
library(dplyr)
library(stringr)
df1 %>%
  mutate(across(everything(), ~ str_remove_all(.x, "\\D+")))
# q2_3 q2_4 q2_5
#1 1 2 3
#2 4 5 6
data
df1 <- structure(list(q2_3 = c("1A", "4D"), q2_4 = c("2B", "5E"), q2_5 = c("3C",
"6F")), class = "data.frame", row.names = c("1", "2"))
I am working with a data frame in R in which a column contains gene IDs separated by slashes, like the following:
geneIDs <- c("100/1000/100008586","1277/63923/8516","1133/1132/1956/8516")
#> geneIDs
#> [1] "100/1000/100008586" "1277/63923/8516" "1133/1132/1956/8516"
I need to convert each of the geneIDs to its Gene Symbol, based on a data.frame that contains, in each row, a geneID and its corresponding Gene Symbol, as depicted below:
#> head(gene_symbols)
ENTREZID SYMBOL
1 1 A1BG
2 10 NAT2
3 100 ADA
4 1000 CDH2
5 10000 AKT3
6 100008586 GAGE12F
Using the first element from the geneIDs as an example, my expected outcome would look like:
#> geneIDs
#> [1] "ADA/CDH2/GAGE12F"
Thank you very much in advance!
Possible solution:
library(dplyr)
library(tidyr)
library(tibble)

geneIDs <- c("100/1000/100008586","1277/63923/8516","1133/1132/1956/8516")

lookupTable <- structure(list(ENTREZID = c(1L, 10L, 100L, 1000L, 10000L, 100008586L),
                              SYMBOL = c("A1BG", "NAT2", "ADA", "CDH2", "AKT3", "GAGE12F")),
                         row.names = c(NA, -6L),
                         class = c("data.table", "data.frame")) %>%
  mutate(ENTREZID = as.character(ENTREZID))

as_tibble(x = geneIDs) %>%
  mutate(value = strsplit(value, split = "/")) %>%
  unnest_longer(value) %>%
  left_join(lookupTable, by = c("value" = "ENTREZID"))
Which gives:
# A tibble: 10 × 2
value SYMBOL
<chr> <chr>
1 100 ADA
2 1000 CDH2
3 100008586 GAGE12F
4 1277 NA
5 63923 NA
6 8516 NA
7 1133 NA
8 1132 NA
9 1956 NA
10 8516 NA
Or to return exactly what you specified:
geneString <- as_tibble(x = geneIDs) %>%
  mutate(value = strsplit(value, split = "/")) %>%
  unnest_longer(value) %>%
  left_join(lookupTable, by = c("value" = "ENTREZID")) %>%
  filter(!is.na(SYMBOL)) %>%
  pull(SYMBOL)

paste(geneString, collapse = "/")
"ADA/CDH2/GAGE12F"
You could split the strings at the "/", match each piece against the ENTREZID column to look up the SYMBOL, replace any non-matches with the original string fragment, and paste the result back together, collapsing with "/":
sapply(strsplit(geneIDs, '/'), function(x) {
  y <- gene_symbols$SYMBOL[match(x, gene_symbols$ENTREZID)]
  y[is.na(y)] <- x[is.na(y)]
  paste0(y, collapse = '/')
})
#> [1] "ADA/CDH2/GAGE12F" "1277/63923/8516" "1133/1132/1956/8516"
You can do this:
library(tidyverse)
geneIDs %>%
  map(~ {
    vec <- gene_symbols$SYMBOL[gene_symbols$ENTREZID %in% unlist(str_split(.x, '/'))]
    if (length(vec) > 0) {
      paste(vec, collapse = '/')
    }
  }) %>%
  keep(~ length(.x) > 0)
[[1]]
[1] "ADA/CDH2/GAGE12F"
Perhaps gsubfn can be used here
library(gsubfn)
library(tibble)
gsubfn("\\d+", as.list(deframe(gene_symbols)), geneIDs)
[1] "ADA/CDH2/GAGE12F" "1277/63923/8516" "1133/1132/1956/8516"
data
gene_symbols <- structure(list(ENTREZID = c(1L, 10L, 100L, 1000L, 10000L, 100008586L),
                               SYMBOL = c("A1BG", "NAT2", "ADA", "CDH2", "AKT3", "GAGE12F")),
                          class = "data.frame",
                          row.names = c("1", "2", "3", "4", "5", "6"))
geneIDs <- c("100/1000/100008586","1277/63923/8516","1133/1132/1956/8516")
I have a data frame with very annoying variable names; basically, my observations of the same variable are distributed across variables g1_param1, g2_param1, g3_param1, etc., and I would like to join all of them into one variable/column named param1.
g1_param1 g2_param1 g1_param2 g2_param2
       NA         7        NA         4
        1        NA         1        NA
I have some ideas using grepl, but that seems quite complicated for something that can probably be done more easily.
Perhaps this?
no_g <- split(names(dat), sub(".*_", "", names(dat)))
no_g
# $param1
# [1] "g1_param1" "g2_param1"
# $param2
# [1] "g1_param2" "g2_param2"
as.data.frame(
  Map(function(nm, cols) do.call(dplyr::coalesce, dat[cols]), names(no_g), no_g)
)
# param1 param2
# 1 7 4
# 2 1 1
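To see what happens for a single group: dat[no_g$param1] is just the two param1 columns, and coalesce() picks the first non-NA value in each row (a quick check, not part of the original answer):
do.call(dplyr::coalesce, dat[no_g$param1])
# [1] 7 1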
You can replace dplyr::coalesce with data.table::fcoalesce or a home-grown function if you don't have dplyr available. For instance,
my_coalesce <- function(...) {
  dots <- list(...)
  if (length(dots) == 0) return()
  if (length(dots) == 1) return(dots[[1]])
  out <- dots[[1]]
  for (i in seq_along(dots)[-1]) {
    if (!any(isna <- is.na(out))) break
    out[isna] <- dots[[i]][isna]
  }
  out
}
(This is under-tested; it "should work", but you are likely better off with something more time-proven. Notably, it does not verify that the elements of dots all have the same length, as they must for this operation, or that they have any length at all.)
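A quick sanity check on vectors shaped like the two param1 columns:
my_coalesce(c(NA, 1L), c(7L, NA))
# [1] 7 1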
Data:
dat <- structure(list(g1_param1 = c(NA, 1L), g2_param1 = c(7L, NA), g1_param2 = c(NA, 1L), g2_param2 = c(4L, NA)), class = "data.frame", row.names = c(NA, -2L))
library(tidyr)
library(stringr)
df <- structure(list(g1_param1 = c(NA, 1L), g2_param1 = c(7L, NA),
g1_param2 = c(NA, 1L), g2_param2 = c(4L, NA)), row.names = c(NA,
-2L), class = "data.frame")
df
#> g1_param1 g2_param1 g1_param2 g2_param2
#> 1 NA 7 NA 4
#> 2 1 NA 1 NA
df %>%
  pivot_longer(everything(),
               names_transform = ~ str_extract(.x, 'param\\d+'),
               names_to = '.value',
               values_drop_na = TRUE)
#> # A tibble: 2 × 2
#> param1 param2
#> <int> <int>
#> 1 7 4
#> 2 1 1
Created on 2022-09-09 with reprex v2.0.2
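The key step is names_transform: stripping the g* prefix gives repeated names, and names_to = '.value' then stacks the columns that share a name. A quick check of what the transform produces:
str_extract(names(df), 'param\\d+')
# [1] "param1" "param1" "param2" "param2"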
I have a data.frame (df1) and I want to include a single, most recent age for each of my samples from another data.frame (df2):
df1$age <- df2$age_9[match(df1$Sample_ID, df2$Sample_ID)]
The problem is that in df2 there are 9 columns for age, as each one records the age at a specific check-up date (age_1 is from the first visit, age_9 is the age at the 9th visit), and patients don't make all their visits.
How do I add the most recently obtained age from a non-empty check-up date?
That is, if age_9 == "." replace "." with age_8; then if age_8 == "." replace "." with age_7, and so on.
From this:
View(df1)
Sample Age
1 50
2 .
3 .
To:
View(df1)
Sample Age
1 50
2 49
3 30
From the data df2
View(df2)
Sample Age_1 Age_2 Age_3
1 40 42 44
2 35 49 .
3 30 . .
This is my attempt:
df1$age[which(df1$age == ".")] <- df2$age_8[match(df1$Sample_ID, df2$Sample_ID)]
With base R, we can use max.col to return, for each row, the index of the last 'Age' column that is not ".". cbind that with the row sequence to get a row/column index matrix, extract those elements from df2, and use them to replace the values of df1's 'Age' column wherever 'Age' is ".":
df1$Age <- ifelse(df1$Age == ".",
                  df2[-1][cbind(seq_len(nrow(df2)), max.col(df2[-1] != ".", "last"))],
                  df1$Age)
df1 <- type.convert(df1, as.is = TRUE)
-output
df1
# Sample Age
#1 1 50
#2 2 49
#3 3 30
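To see the intermediate pieces (a sketch against the df2 from the data section below):
idx <- max.col(df2[-1] != ".", "last")   # last non-"." Age column in each row
idx
# [1] 3 2 1
df2[-1][cbind(seq_len(nrow(df2)), idx)]  # the corresponding Age values
# [1] "44" "49" "30"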
Or, using tidyverse: reshape into 'long' format, slice the last row per 'Sample', and then do a join back to 'df1':
library(dplyr)
library(tidyr)
df2 %>%
  mutate(across(starts_with('Age'), as.integer)) %>%
  pivot_longer(cols = starts_with('Age'), values_drop_na = TRUE) %>%
  group_by(Sample) %>%
  slice_tail(n = 1) %>%
  ungroup() %>%
  select(-name) %>%
  right_join(df1) %>%
  transmute(Sample, Age = coalesce(as.integer(Age), value))
-output
# A tibble: 3 x 2
# Sample Age
# <int> <int>
#1 1 50
#2 2 49
#3 3 30
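For reference, the intermediate result after slice_tail() (before joining back to 'df1') keeps one row per Sample, holding the age from the last visit that had one recorded:
df2 %>%
  mutate(across(starts_with('Age'), as.integer)) %>%
  pivot_longer(cols = starts_with('Age'), values_drop_na = TRUE) %>%
  group_by(Sample) %>%
  slice_tail(n = 1)
# Sample 1 -> Age_3 = 44, Sample 2 -> Age_2 = 49, Sample 3 -> Age_1 = 30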
data
df1 <- structure(list(Sample = 1:3, Age = c("50", ".", ".")),
                 class = "data.frame", row.names = c(NA, -3L))
df2 <- structure(list(Sample = 1:3,
                      Age_1 = c(40L, 35L, 30L),
                      Age_2 = c("42", "49", "."),
                      Age_3 = c("44", ".", ".")),
                 class = "data.frame", row.names = c(NA, -3L))
I'm trying to count how often each date in one data frame appears in another data frame.
df1 <- data.frame(c('1991-01-09', '1991-01-11', '1991-02-17'))
df2 <- data.frame(c('1991-01-09', '1991-01-09', '1991-02-17'))
The result would be the following:
Date Freq
1991-01-09 2
1991-01-11 0
1991-02-17 1
df1$count <- rowSums(outer(df1$d, df2$d, `==`))
df1
# d count
# 1 1991-01-09 2
# 2 1991-01-11 0
# 3 1991-02-17 1
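To unpack this: outer() compares every date in df1 with every date in df2, giving a logical matrix, and rowSums() counts the TRUEs in each row:
outer(df1$d, df2$d, `==`)
#       [,1]  [,2]  [,3]
# [1,]  TRUE  TRUE FALSE
# [2,] FALSE FALSE FALSE
# [3,] FALSE FALSE  TRUE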
Data
df1 <- structure(list(d = c("1991-01-09", "1991-01-11", "1991-02-17")), row.names = c(NA, -3L), class = "data.frame")
df2 <- structure(list(d = c("1991-01-09", "1991-01-09", "1991-02-17")), class = "data.frame", row.names = c(NA, -3L))
Using sapply (with the date column named d, as in the Data above):
stack(sapply(df1$d, function(x) sum(df2$d == x)))
# values ind
#1 2 1991-01-09
#2 0 1991-01-11
#3 1 1991-02-17
Or you could use purrr::map_dbl():
data.frame("Date" = df1[, 1],
"Freq" = purrr::map_dbl(df1[, 1], ~sum(.x == df2[, 1]))
)
Date Freq
1 1991-01-09 2
2 1991-01-11 0
3 1991-02-17 1
I want to perform a join.
df1 <- structure(list(id = 1:3, group_id = c(10L, 20L, 40L)),
                 class = "data.frame", row.names = c(NA, -3L))
df2 has a different structure: its group_id field contains many groups, for example {10,100,400}. Here is its dput():
df2 <- structure(list(id = 1:3,
                      group_id = structure(c(1L, 3L, 2L),
                                           .Label = c("{`10`,100,`40`}", "{3,`40`,600,100}", "{4}"),
                                           class = "factor")),
                 class = "data.frame", row.names = c(NA, -3L))
df2 has group_id 10 and 40, but they are in braces together with other groups.
How do I get the desired joined output?
id group_id
1 10
1 40
3 40
You can clean group_id in df2 using gsub, bring each group id into a separate row, and then filter:
library(dplyr)
df2 %>%
  mutate(group_id = gsub('[{}`]', '', group_id)) %>%
  tidyr::separate_rows(group_id) %>%
  filter(group_id %in% df1$group_id)
# id group_id
#1 1 10
#2 1 40
#3 3 40
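For reference, the intermediate result before the filter pairs every id with each group it contains:
df2 %>%
  mutate(group_id = gsub('[{}`]', '', group_id)) %>%
  tidyr::separate_rows(group_id)
# id 1 -> 10, 100, 40;  id 2 -> 4;  id 3 -> 3, 40, 600, 100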
Here's a data.table alternative (df2 needs to be a data.table, e.g. via setDT(df2)):
library(data.table)
setDT(df2)
df2[, strsplit(gsub('[{}`]', '', group_id), ','), by = id][V1 %in% df1$group_id]
# id V1
#1: 1 10
#2: 1 40
#3: 3 40
Here is an option with base R using regmatches/gregexpr:
subset(setNames(stack(setNames(regmatches(df2$group_id, gregexpr("\\d+", df2$group_id)),
                               df2$id))[2:1], c('id', 'group_id')),
       group_id %in% df1$group_id)
# id group_id
#1 1 10
#3 1 40
#6 3 40
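To unpack the nested call: regmatches()/gregexpr() first pulls out all the digit runs per row, setNames()/stack() then turns that list into a two-column data frame keyed by id, and subset() keeps only the groups present in df1. The first step looks like this:
regmatches(df2$group_id, gregexpr("\\d+", df2$group_id))
# [[1]]
# [1] "10"  "100" "40"
#
# [[2]]
# [1] "4"
#
# [[3]]
# [1] "3"   "40"  "600" "100"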