The data frame is like this:
enter image description here
header: system
Row 1: 00000000000000000503_0
Row 2: 00000000000000000503_1
Row 3: 00000000000000000503_2
Row 4: 00000000000000000503_3
Row 5: 000000000000000004e7_0
Row 6: 000000000000000004e7_1
Row 7: 00000000000000000681_0
Row 8: 00000000000000000681_1
Row 9: 00000000000000000681_2
I want to generate a frequency table with the quantities of the code before string "_" such that:
"00000000000000000503" appears 4 times, "000000000000000004e7" appears 2 times, and so on.
How do I do this in R?
Remove everything after underscore and use table to count frequency
table(sub("_.*", "", data$col1))
#Also
#table(sub("(.*)_.*", "\\1", data$col1))
#000000000000000004e7 00000000000000000503 00000000000000000681
# 2 4 3
If final output needs to be a dataframe use stack
stack(table(sub("_.*", "", data$col1)))
# values ind
#1 2 000000000000000004e7
#2 4 00000000000000000503
#3 3 00000000000000000681
data
data <- structure(list(col1 = structure(c(3L, 4L, 5L, 6L, 1L, 2L, 7L,
8L, 9L), .Label = c("000000000000000004e7_0", "000000000000000004e7_1",
"00000000000000000503_0", "00000000000000000503_1",
"00000000000000000503_2",
"00000000000000000503_3", "00000000000000000681_0",
"00000000000000000681_1",
"00000000000000000681_2"), class = "factor")), class = "data.frame",
row.names = c(NA, -9L))
A dplyr-tidyr alternative:
df %>%
tidyr::separate(V3, c("target", "non_target")) %>%
count(target)
# A tibble: 3 x 2
target n
<chr> <int>
1 000000000000000004e7 2
2 00000000000000000503 4
3 00000000000000000681 3
With base:
# Count codes by the part before "_". The sample `df` keeps the codes in
# column V3; vapply() guarantees a character result even for empty input.
table(vapply(strsplit(df$V3, "_"), `[[`, FUN.VALUE = character(1), 1L))
Data:
df <- structure(list(V1 = c("Row", "Row", "Row", "Row", "Row", "Row",
"Row", "Row", "Row"), V2 = c("1:", "2:", "3:", "4:", "5:", "6:",
"7:", "8:", "9:"), V3 = c("00000000000000000503_0", "00000000000000000503_1",
"00000000000000000503_2", "00000000000000000503_3", "000000000000000004e7_0",
"000000000000000004e7_1", "00000000000000000681_0", "00000000000000000681_1",
"00000000000000000681_2")), class = "data.frame", row.names = c(NA,
-9L))
Another option using the stringr library that is included in tidyverse
> library(tidyverse)
> mydata <- data.frame(system = c("00000000000000000503_0",
"00000000000000000503_1",
"00000000000000000503_2",
"00000000000000000503_3",
"000000000000000004e7_0",
"000000000000000004e7_1",
"00000000000000000681_0",
"00000000000000000681_1",
"00000000000000000681_2"))
> mydata
system
1 00000000000000000503_0
2 00000000000000000503_1
3 00000000000000000503_2
4 00000000000000000503_3
5 000000000000000004e7_0
6 000000000000000004e7_1
7 00000000000000000681_0
8 00000000000000000681_1
9 00000000000000000681_2
> # Split data using str_split
> mydata$leftside <- sapply(mydata$system, function(x) unlist(str_split(x, "_"))[1]) #split string by the "_" and take first piece
> mydata$rightside <- sapply(mydata$system, function(x) unlist(str_split(x, "_"))[2]) #split string by the "_" and take second piece
>
> mydata
system leftside rightside
1 00000000000000000503_0 00000000000000000503 0
2 00000000000000000503_1 00000000000000000503 1
3 00000000000000000503_2 00000000000000000503 2
4 00000000000000000503_3 00000000000000000503 3
5 000000000000000004e7_0 000000000000000004e7 0
6 000000000000000004e7_1 000000000000000004e7 1
7 00000000000000000681_0 00000000000000000681 0
8 00000000000000000681_1 00000000000000000681 1
9 00000000000000000681_2 00000000000000000681 2
> # Alternative tabulation function to base::table(); it can provide nicer options.
> xtabs(data = mydata, formula = ~leftside)
leftside
000000000000000004e7 00000000000000000503 00000000000000000681
2 4 3
A tidyverse answer would be
my_data <- mydata %>%
mutate_if(is.factor, as.character) %>%
mutate(system = gsub('_[^_]*$', '', system)) %>%
group_by(system) %>%
count() %>%
ungroup()
my_data
An option with str_remove and group_by
library(stringr)
library(dplyr)
df %>%
group_by(V3 = str_remove(V3, "_\\d+$")) %>%
summarise(n = n())
# A tibble: 3 x 2
# V3 n
# <chr> <int>
#1 000000000000000004e7 2
#2 00000000000000000503 4
#3 00000000000000000681 3
Or in base R with table and trimws
table(trimws(df$V3, whitespace = "_[0-9]+"))
data
df <- structure(list(V1 = c("Row", "Row", "Row", "Row", "Row", "Row",
"Row", "Row", "Row"), V2 = c("1:", "2:", "3:", "4:", "5:", "6:",
"7:", "8:", "9:"), V3 = c("00000000000000000503_0", "00000000000000000503_1",
"00000000000000000503_2", "00000000000000000503_3", "000000000000000004e7_0",
"000000000000000004e7_1", "00000000000000000681_0", "00000000000000000681_1",
"00000000000000000681_2")), class = "data.frame", row.names = c(NA,
-9L))
Related
I have a dataframe df and want to remove everything including and after the third '-' in the column 'case_id':
df
case_id unit
TCGA-3A-01-03-9441 27
TCGA-9C-01-04-9641 15
TCGA-1E-01-05-9471 6
This is the desired output:
df
case_id unit
TCGA-3A-01 27
TCGA-9C-01 15
TCGA-1E-01 6
We could use str_replace
library(stringr)
library(dplyr)
df1 %>%
mutate(case_id = str_replace(case_id, "^(([^-]+-){2}[^-]+)-.*", "\\1"))
-output
case_id unit
1 TCGA-3A-01 27
2 TCGA-9C-01 15
3 TCGA-1E-01 6
data
df1 <- structure(list(case_id = c("TCGA-3A-01-03-9441", "TCGA-9C-01-04-9641",
"TCGA-1E-01-05-9471"), unit = c(27L, 15L, 6L)),
class = "data.frame", row.names = c(NA,
-3L))
I have two data frames looking like that
data frame 1:
P.X value
OOPA 5
POKA 4
JKIO 3
KOPP 1
data frame 2:
P.X.1 P.X.2 P.X.3 P.X.4 mass
JKIO UIX HOP 56
CX OOPA 44
EDD POKA 13
KOPP FOSI 11
and I want to merge the two data files based on the df1 P.X and df2 P.X.1,P.X.2,P.X.3,P.X.4. So if it the JKIO in P.X.2. appears in the P.X one then merge them in a new data frame in the same row JKIO, 3, 56 as below:
data frame new:
P.X value mass
OOPA 5 44
POKA 4 13
JKIO 3 56
KOPP 1 11
Do you know how can I do it maybe with
merge(df1,df2 by(P.X == P.X.1 | P.X.2 | P.X.3 | P.X.4)
?
The following is one way to achieve your goal. You want to convert df2 to long format and keep only the rows whose P.X value is non-empty (at least one character). Once you have this data, you merge df1 with the updated df2.
library(dplyr)
library(tidyr)
left_join(df1,
pivot_longer(df2, cols = P.X.1:P.X.4, names_to = "foo",
values_to = "P.X") %>% filter(nchar(P.X) > 0),
by = "P.X") %>%
select(-foo)
P.X value mass
1 OOPA 5 44
2 POKA 4 13
3 JKIO 3 56
4 KOPP 1 11
DATA
df1 <- structure(list(P.X = c("OOPA", "POKA", "JKIO", "KOPP"), value = c(5L,
4L, 3L, 1L)), class = "data.frame", row.names = c(NA, -4L))
df2 <- structure(list(P.X.1 = c("", "", "EDD", "KOPP"), P.X.2 = c("JKIO",
"", "", "FOSI"), P.X.3 = c("UIX", "CX", "POKA", ""), P.X.4 = c("HOP",
"OOPA", "", ""), mass = c(56, 44, 13, 11)), row.names = c(NA,
-4L), class = c("tbl_df", "tbl", "data.frame"))
You could also just do:
df_new <- cbind(df1, df2[,5])
I was wondering if someone here can help me with a lapply question.
Every month, data are extracted and the data frames are named according to the date extracted (01-08-2019,01-09-2019,01-10-2019 etc). The contents of each data frame are similar to the example below:
01-09-2019
ID DOB
3 01-07-2019
5 01-06-2019
7 01-05-2019
8 01-09-2019
01-10-2019
ID DOB
2 01-10-2019
5 01-06-2019
8 01-09-2019
9 01-02-2019
As the months roll on, there are more data sets being downloaded.
I am wanting to calculate the ages of people in each of the data sets based on the date the data was extracted - so in essence, the age would be the date difference between the data frame name and the DOB variable.
01-09-2019
ID DOB AGE(months)
3 01-07-2019 2
5 01-06-2019 3
7 01-05-2019 4
8 01-09-2019 0
01-10-2019
ID DOB AGE(months)
2 01-10-2019 0
5 01-06-2019 4
8 01-09-2019 1
9 01-02-2019 8
I was thinking of putting all of the data frames together in a list (as there are a lot) and then using lapply to calculate age across all data frames. How do I go about calculating the difference between a data frame name and a column?
If I may suggest a slightly different approach: It might make more sense to compress your list into a single data frame before calculating the ages. Given your data looks something like this, i.e. it is a list of data frames, where the list element names are the dates of access:
$`01-09-2019`
# A tibble: 4 x 2
ID DOB
<dbl> <date>
1 3 2019-07-01
2 5 2019-06-01
3 7 2019-05-01
4 8 2019-09-01
$`01-10-2019`
# A tibble: 4 x 2
ID DOB
<dbl> <date>
1 2 2019-10-01
2 5 2019-06-01
3 8 2019-09-01
4 9 2019-02-01
You can call bind_rows first with parameter .id = "date_extracted" to turn your list into a data frame, and then calculate age in months.
library(tidyverse)
library(lubridate)
# Stack the per-extraction data frames; the list names (extraction dates)
# become the `date_extracted` column.
tib <- bind_rows(tib_list, .id = "date_extracted") %>%
  mutate(date_extracted = dmy(date_extracted),
         DOB = dmy(DOB),
         # Whole months elapsed between DOB and the extraction date. Unlike
         # month(date_extracted) - month(DOB), this stays correct when the
         # two dates fall in different years.
         age_months = interval(DOB, date_extracted) %/% months(1)
  )
#### OUTPUT ####
# A tibble: 8 x 4
date_extracted ID DOB age_months
<date> <dbl> <date> <dbl>
1 2019-09-01 3 2019-07-01 2
2 2019-09-01 5 2019-06-01 3
3 2019-09-01 7 2019-05-01 4
4 2019-09-01 8 2019-09-01 0
5 2019-10-01 2 2019-10-01 0
6 2019-10-01 5 2019-06-01 4
7 2019-10-01 8 2019-09-01 1
8 2019-10-01 9 2019-02-01 8
This can be solved with lapply as well but we can also use Map in this case to iterate over list and their names after adding all the dataframes in a list. In base R,
# Both the list names and the DOB values are "dd-mm-yyyy" strings, so the
# format is given explicitly; as.Date()'s default formats would misread
# them (e.g. "01-07-2019" as year 1, month 7, day 20).
Map(function(x, y) {
  x$DOB <- as.Date(x$DOB, format = "%d-%m-%Y")
  transform(x, age = as.integer(format(as.Date(y, format = "%d-%m-%Y"), "%m")) -
              as.integer(format(x$DOB, "%m")))
}, list_df, names(list_df))
#$`01-09-2019`
# ID DOB age
#1 3 2019-07-01 2
#2 5 2019-06-01 3
#3 7 2019-05-01 4
#4 8 2019-09-01 0
#$`01-10-2019`
# ID DOB age
#1 2 2019-10-01 0
#2 5 2019-06-01 4
#3 8 2019-09-01 1
#4 9 2019-02-01 8
We can also do the same in tidyverse
library(dplyr)
library(lubridate)
# Parse the "dd-mm-yyyy" list names (.y) and DOB values with dmy() before
# taking the month, instead of relying on month() to guess the format.
purrr::imap(
  list_df,
  ~ .x %>% mutate(age = month(dmy(.y)) - month(dmy(as.character(DOB))))
)
data
list_df <- list(`01-09-2019` = structure(list(ID = c(3L, 5L, 7L, 8L),
DOB = structure(c(3L, 2L, 1L, 4L), .Label = c("01-05-2019", "01-06-2019",
"01-07-2019", "01-09-2019"), class = "factor")), class = "data.frame",
row.names = c(NA, -4L)), `01-10-2019` = structure(list(ID = c(2L, 5L, 8L, 9L),
DOB = structure(c(4L, 2L, 3L, 1L), .Label = c("01-02-2019",
"01-06-2019", "01-09-2019", "01-10-2019"), class = "factor")),
class = "data.frame", row.names = c(NA, -4L)))
It's bad practice to use dates and numbers as data frame names; consider prefixing the date with an "x", as shown below in this base R solution:
df_list <- list(x01_09_2019 = `01-09-2019`, x01_10_2019 = `01-10-2019`)
df_list <- mapply(cbind, "report_date" = names(df_list), df_list, SIMPLIFY = F)
df_list <- lapply(df_list, function(x){
x$report_date <- as.Date(gsub("_", "-", gsub("x", "", x$report_date)), "%d-%m-%Y")
x$Age <- x$report_date - x$DOB
return(x)
}
)
Data:
`01-09-2019` <- structure(list(ID = c(3, 5, 7, 8),
DOB = structure(c(18078, 18048, 18017, 18140), class = "Date")),
class = "data.frame", row.names = c(NA, -4L))
`01-10-2019` <- structure(list(ID = c(2, 5, 8, 9),
DOB = structure(c(18170, 18048, 18140, 17928), class = "Date")),
class = "data.frame", row.names = c(NA, -4L))
I have a data frame that looks like this:
# A tibble: 5 x 5
# Groups: Trial [1]
GID Trial pop `1A-1145442` `1A-1158042`
<chr> <chr> <chr> <int> <int>
GID421213 ES1 ES1-5 12 11
GID419903 ES1 ES1-5 22 12
GID3881 ES1 ES1-5 22 22
GID13646 ES1 ES1-5 12 12
GID418846 ES1 ES1-5 22 11
Here is a dput of it :
structure(list(GID = c("GID421213", "GID419903", "GID3881", "GID13646",
"GID418846"), Trial = c("ES1", "ES1", "ES1", "ES1", "ES1"), pop = c("ES1-5",
"ES1-5", "ES1-5", "ES1-5", "ES1-5"), `1A-1145442` = c(12L, 22L,
22L, 12L, 22L), `1A-1158042` = c(11L, 12L, 22L, 12L, 11L)), row.names =
c(NA, -5L), class = c("grouped_df", "tbl_df", "tbl", "data.frame"), vars =
"Trial", drop = TRUE, indices = list(0:4), group_sizes = 5L,
biggest_group_size = 5L, labels = structure(list(Trial = "ES1"), row.names
= c(NA, -1L), class = "data.frame", vars = "Trial", drop = TRUE))
I want to perform a regrouping transformation into a new column from the Trial column just as I did in the past with the pop column using regex operations but now with dplyr. The Trial column consists of ES values from 1 to 38: I would like to group in this fashion ES1-3,ES4-6,ES7-9 and so forth using the dplyr package. I know I could start with df %>% group_by(Trial) but from there on I have no idea how I could operate.
library(dplyr)
df %>%
mutate(pop2 = case_when(
Trial == "ES1" | Trial == "ES2" | Trial == "ES3" ~ "ES1-3",
Trial == "ES4" | Trial == "ES5" | Trial == "ES6" ~ "ES4-6"
))
Will return
# A tibble: 5 x 6
# Groups: Trial [1]
GID Trial pop `1A-1145442` `1A-1158042` pop2
<chr> <chr> <chr> <int> <int> <chr>
1 GID421213 ES1 ES1-5 12 11 ES1-3
2 GID419903 ES1 ES1-5 22 12 ES1-3
3 GID3881 ES1 ES1-5 22 22 ES1-3
4 GID13646 ES1 ES1-5 12 12 ES1-3
5 GID418846 ES1 ES1-5 22 11 ES1-3
Given
(df <- data.frame(Trial = paste0("ES", 1:10)))
# Trial
# 1 ES1
# 2 ES2
# 3 ES3
# 4 ES4
# 5 ES5
# 6 ES6
# 7 ES7
# 8 ES8
# 9 ES9
# 10 ES10
We may, using base R, do
size <- 3
groups <- (as.numeric(substring(df$Trial, 3)) - 1) %/% size
(df$newCol <- sprintf("ES%d-%d", 1 + groups * size, size * (1 + groups)))
# [1] "ES1-3" "ES1-3" "ES1-3" "ES4-6" "ES4-6" "ES4-6" "ES7-9" "ES7-9"
# [9] "ES7-9" "ES10-12"
Here as.numeric(substring(df$Trial, 3)) gets the numeric part of df$Trial and converts it to a numeric vector. Subtracting 1 and using %/% then returns the group number for each element of df$Trial, starting from 0. Given a group number, we can easily construct a new column with sprintf.
size is the size of groups. E.g., setting size <- 5 would give values ES1-5, ES6-10, and so on.
Here's a solution that uses parse_number from readr.
df %>%
mutate(grp = cut(parse_number(Trial),
breaks = seq(1, 38, by = 3),
right = FALSE)) %>%
group_by(grp)
This pulls out the number from Trial then cuts to create a grouping variable, which it then groups by. right=FALSE indicates that the interval is closed on the left.
An edit based on a comment below.
# Bin the numeric part of Trial into [1,4), [4,7), ..., [31,34), [34,38].
# include.lowest = TRUE (an argument of cut(), not of mutate()) closes the
# final interval on the right so that Trial 38 is not turned into NA.
df %>%
  mutate(grp = cut(parse_number(Trial),
                   breaks = c(seq(1, 34, by = 3), 38),
                   right = FALSE,
                   include.lowest = TRUE)) %>%
  group_by(grp)
I have a (sample)table like this:
df <- read.table(header = TRUE,
stringsAsFactors = FALSE,
text="Gene SYMBOL Values
TP53 2 3.55
XBP1 5 4.06
TP27 1 2.53
REDD1 4 3.99
ERO1L 6 5.02
STK11 9 3.64
HIF2A 8 2.96")
I want to look up the symbols from two different genelists, given here as genelist1 and genelist2:
genelist1 <- read.table(header = TRUE,
stringsAsFactors = FALSE,
text="Gene SYMBOL
P4H 10
PLK 7
TP27 1
KTD 11
ERO1L 6")
genelist2 <- read.table(header = TRUE,
stringsAsFactors = FALSE,
text="Gene SYMBOL
TP53 2
XBP1 5
BHLHB 12
STK11 9
TP27 1
UPK 18")
What I want is to get a new column where I can see in which genelist(s) I can find each of the genes in my dataframe, but when I run the following code, only the symbols are repeated in the new columns.
df_geneinfo <- df %>%
join(genelist1,by="SYMBOL") %>%
join(genelist2, by="SYMBOL")
Any suggestions of how to solve this, either to make one new column with the name of the genelists, or to make one column for each of the genelists?
Thanks in advance! :)
For the sake of completeness (and performance with large tables, perhaps), here is a data.table approach:
library(data.table)
rbindlist(list(genelist1, genelist2), idcol = "glid")[, -"Gene"][
setDT(df), on = "SYMBOL"][, .(glid = toString(glid)), by = .(Gene, SYMBOL, Values)][]
Gene SYMBOL Values glid
1: TP53 2 3.55 2
2: XBP1 5 4.06 2
3: TP27 1 2.53 1, 2
4: REDD1 4 3.99 1
5: ERO1L 6 5.02 NA
6: STK11 9 3.64 2
7: HIF2A 8 2.96 NA
rbindlist() creates a data.table from all genelists and adds a column glid to identify the origin of each row. The Gene column is ignored as the subsequent join is only on SYMBOL. Before joining, df is coerced to class data.table using setDT(). The joined result is then aggregated by SYMBOL to exhibit cases where a symbol appears in both genelists which is the case for SYMBOL == 1.
Edit
In case there are many genelists or the full name of the genelist is required instead of just a number, we can try this:
rbindlist(mget(ls(pattern = "^genelist")), idcol = "glid")[, -"Gene"][
setDT(df), on = "SYMBOL"][, .(glid = toString(glid)), by = .(Gene, SYMBOL, Values)][]
Gene SYMBOL Values glid
1: TP53 2 3.55 genelist2
2: XBP1 5 4.06 genelist2
3: TP27 1 2.53 genelist1, genelist2
4: REDD1 4 3.99 genelist1
5: ERO1L 6 5.02 NA
6: STK11 9 3.64 genelist2
7: HIF2A 8 2.96 NA
ls() looks for objects in the environment whose names start with genelist. mget() returns a named list of those objects, which is passed to rbindlist().
Data
As provided by the OP
df <- structure(list(Gene = c("TP53", "XBP1", "TP27", "REDD1", "ERO1L",
"STK11", "HIF2A"), SYMBOL = c(2L, 5L, 1L, 4L, 6L, 9L, 8L), Values = c(3.55,
4.06, 2.53, 3.99, 5.02, 3.64, 2.96)), .Names = c("Gene", "SYMBOL",
"Values"), class = "data.frame", row.names = c(NA, -7L))
genelist1 <- structure(list(Gene = c("P4H", "PLK", "TP27", "KTD", "ERO1L"),
SYMBOL = c(10L, 7L, 1L, 11L, 4L)), .Names = c("Gene", "SYMBOL"
), class = "data.frame", row.names = c(NA, -5L))
genelist2 <- structure(list(Gene = c("TP53", "XBP1", "BHLHB", "STK11", "TP27",
"UPK"), SYMBOL = c(2L, 5L, 12L, 9L, 1L, 18L)), .Names = c("Gene",
"SYMBOL"), class = "data.frame", row.names = c(NA, -6L))
I just wrote my own function, which replaces the column values:
# Replace the values of selected columns using a lookup table.
#
# df:     data frame to modify.
# col:    character vector of column names in `df` whose values are replaced.
# lookup: data frame with columns "old" (values to look for) and "new"
#         (their replacements).
#
# An assertion fails when a name in `col` is not a column of `df`, when
# `lookup` lacks "new"/"old", or when my_match() yields NA for any value.
# Returns `df` with the selected columns overwritten.
# NOTE(review): my_match() is defined elsewhere; presumably it behaves like
# match() -- confirm.
replace_by_lookuptable <- function(df, col, lookup) {
  assertthat::assert_that(all(col %in% names(df))) # all cols exist in df
  assertthat::assert_that(all(c("new", "old") %in% colnames(lookup)))

  # Position of each value within lookup$old.
  find_old <- function(x) my_match(x, lookup$old)

  # Every value must have a counterpart in the lookup table.
  positions <- unlist(lapply(df[, col], find_old))
  cond_na_exists <- is.na(positions)
  assertthat::assert_that(!any(cond_na_exists))

  replacements <- lapply(df[, col], function(x) lookup$new[find_old(x)])
  df[, col] <- unlist(replacements)
  df
}
df is the data.frame, col is a vector of column names which should be replaced using lookup, a data.frame with column "old" and "new".
If you add a listid column to your genelists
genelist1$listid = 1
genelist2$listid = 2
you can then merge your df with the genelists:
merge(df,rbind(genelist1,genelist2),all.x=T, by = "SYMBOL")
Note that ERO1L is SYMBOL 6 in your df and 4 in genelist1, and HIF2A and REDD1 are missing from the genelists, but REDD1 is SYMBOL 4 in your df (which is ERO1L in genelist1), so I'm not sure what output you're expecting in that case.
You could also merge only on Gene names:
merge(df,rbind(genelist1,genelist2),all.x=T, by.x = "Gene", by.y= "Gene")
You could put all of your genelists in a list:
gen_list <- list(genelist1 = genelist1,genelist2 = genelist2)
and compare them to your target data.frame:
cbind(df,do.call(cbind,lapply(seq_along(gen_list),function(x) ifelse( df$Gene %in% gen_list[[x]]$Gene,names(gen_list[x]),NA))))