Related
I want to calculate the fold change between thyroid and testes dataframe using TPM values and provide the top 10 genes overexpressed in testes tissue (testes$gene_id in the testes dataframe).
In my code below, I first calculated the fold change and store it as a numeric vector tpm.foldchange but then I don't know how to sort the gene_id column of the testes dataframe based on the sorted fold-change values tpm.foldchange.
# Parse the gene results file from the testes and thyroid output
thyroid <- read.table("thyroid.genes.results", header=T, sep="\t")
testes <- read.table("testes.genes.results", header=T, sep="\t")
# Extract the TPM values
# Add one to each value and log them (base 2)
library(tidyverse)
thyroid.tpm <- log(thyroid %>% pull(TPM) + 1)
testes.tpm <- log(testes %>% pull(TPM) + 1)
# Pearson's correlation coefficient between thyroid and testes using TPM
cor(thyroid.tpm, testes.tpm, method="pearson")
# Calculate fold change between the testes and thyroid tissue TPM values and provide top 10 genes that are overexpressed in testes
library(gtools)
tpm.foldchange <- foldchange(testes.tpm, thyroid.tpm)
#tpm.df <- merge(testes.tpm, tpm.foldchange)
tpm.sorted <- sort(tpm.foldchange, decreasing=T)
tpm.sortedgenes <- testes[order(factor(testes$TPM, levels=tpm.sorted)),]
tpm.top10genes <- head(tpm.sortedgenes, 10)
testes[order(factor(testes$TPM, levels=tpm.sorted)),]
I initially wanted to sort after merging like this:
tpm.df <- merge(testes.tpm, tpm.foldchange)
tpm.sorted <- sort(tpm.df$tpm.foldchange, decreasing=T)
but it raised an error:
Error: cannot allocate vector of size 8.0 Gb
thyroid dataframe:
# Show only the first 20 rows, first column, and 6th column of thyroid dataframe
dput(thyroid[1:20, c(1,6)])
structure(list(gene_id = c("gene0_DDX11L1", "gene1_WASH7P", "gene100_C1orf233",
"gene1000_ZC3H12A", "gene10000_CD86", "gene10001_CASR", "gene10003_CSTA",
"gene10004_CCDC58", "gene10005_FAM162A", "gene10006_WDR5B", "gene10007_LOC102723582",
"gene10008_KPNA1", "gene1001_MIR6732", "gene10010_PARP9", "gene10011_DTX3L",
"gene10012_PARP15", "gene10015_PARP14", "gene10016_HSPBAP1",
"gene10017_DIRC2", "gene10018_LOC100129550"), TPM = c(0, 45.96,
2.72, 2.4, 1.67, 5.14, 4.33, 47.68, 81.1, 10.12, 0.96, 45.21,
0, 19.63, 15.06, 0.49, 21.76, 12.16, 19.37, 5.3)), row.names = c(NA,
20L), class = "data.frame")
testes dataframe:
# Show only the first 20 rows, first column, and 6th column of testes dataframe
dput(testes[1:20, c(1,6)])
structure(list(gene_id = c("gene0_DDX11L1", "gene1_WASH7P", "gene100_C1orf233",
"gene1000_ZC3H12A", "gene10000_CD86", "gene10001_CASR", "gene10003_CSTA",
"gene10004_CCDC58", "gene10005_FAM162A", "gene10006_WDR5B", "gene10007_LOC102723582",
"gene10008_KPNA1", "gene1001_MIR6732", "gene10010_PARP9", "gene10011_DTX3L",
"gene10012_PARP15", "gene10015_PARP14", "gene10016_HSPBAP1",
"gene10017_DIRC2", "gene10018_LOC100129550"), TPM = c(2.33, 47.56,
9.45, 2.03, 3.09, 0.11, 3.73, 28.52, 120.65, 6.89, 1.38, 30.89,
0, 20.39, 13.66, 0.59, 9.62, 22.04, 7.42, 2.53)), row.names = c(NA,
20L), class = "data.frame")
Based on Akrun's comment, I've attempted:
library(gtools)
tpm.foldchange <- foldchange(thyroid.tpm, testes.tpm)
testes.sorted <- testes %>%
left_join(thyroid, by="gene_id") %>%
mutate(TPM=testes.tpm, tpm.foldchange, .keep="unused") %>%
slice_max(n=10, order_by=tpm.foldchange)
Output:
> dim(testes.sorted)
[1] 304 15
> dput(testes.sorted[1:10,])
structure(list(gene_id = c("gene10075_LOC101927056", "gene10311_A4GNT",
"gene10394_SLC9A9-AS1", "gene10504_SUCNR1", "gene10511_TMEM14E",
"gene10798_LOC102724550", "gene10990_FLJ42393", "gene11054_DPPA2P3",
"gene11065_GP5", "gene11400_USP17L12"), transcript_id.s..x = c("rna28860_NR_125396.1,rna28861_NR_125395.1",
"rna29540_NM_016161.2", "rna29785_NR_048544.1", "rna30020_NM_033050.4",
"rna30060_NM_001123228.1", "rna30716_NR_110826.1", "rna31241_NR_024413.1",
"rna31390_NR_027764.1", "rna31430_NM_004488.2", "rna32519_NM_001256853.1"
), length.x = c(659, 1771, 518, 1650, 1293, 2957, 2266, 1146,
3493, 1593), effective_length.x = c(413.57, 1525.5, 272.62, 1404.5,
1047.5, 2711.5, 2020.5, 900.5, 3247.5, 1347.5), expected_count.x = c(0,
0, 0, 0, 0, 0, 0, 0, 0, 0.12), TPM.x = c(0, 0, 0, 0, 0, 0, 0,
0, 0, 0), FPKM.x = c(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), transcript_id.s..y = c("rna28860_NR_125396.1,rna28861_NR_125395.1",
"rna29540_NM_016161.2", "rna29785_NR_048544.1", "rna30020_NM_033050.4",
"rna30060_NM_001123228.1", "rna30716_NR_110826.1", "rna31241_NR_024413.1",
"rna31390_NR_027764.1", "rna31430_NM_004488.2", "rna32519_NM_001256853.1"
), length.y = c(796, 1771, 518, 1650, 1293, 2957, 2266, 1146,
3493, 1593), effective_length.y = c(535.05, 1510.04, 257.15,
1389.04, 1032.04, 2696.04, 2005.04, 885.04, 3232.04, 1332.04),
expected_count.y = c(9, 3, 2, 233, 2, 2, 36, 2, 35, 1.91),
TPM.y = c(0.58, 0.07, 0.27, 5.8, 0.07, 0.03, 0.62, 0.08,
0.37, 0.05), FPKM.y = c(0.29, 0.03, 0.14, 2.94, 0.03, 0.01,
0.31, 0.04, 0.19, 0.03), TPM = c(0, 0, 0, 0, 0, 0, 0, 0,
0, 0), tpm.foldchange = c(Inf, Inf, Inf, Inf, Inf, Inf, Inf,
Inf, Inf, Inf)), row.names = c(NA, 10L), class = "data.frame")
This code returns a dataframe with (304, 15) dimensions. But I'm only looking for the top ten genes. Also, please note that thyroid.tpm is the log2-transformed TPM values.
If we want to order by the foldchange, do a join first, and arrange based on the foldchange between the 'TPM' columns
library(dplyr)
library(gtools)
testes2 <- testes %>%
left_join(thyroid, by = 'gene_id') %>%
mutate(across(starts_with("TPM"), ~ log(.x + 1),
.names = "tpm_{.col}")) %>%
mutate(foldchange = foldchange(tpm_TPM.x, tpm_TPM.y)) %>%
filter(is.finite(foldchange)) %>%
arrange(tpm_TPM.x) %>%
dplyr::select(gene_id, TPM = TPM.x, foldchange) %>%
slice_head(n = 10)
If we want to select top 10 foldchange rows, use slice_max
testes %>%
left_join(thyroid, by = 'gene_id') %>%
mutate(TPM = TPM.x, foldchange = foldchange(log(TPM.x + 1), log(TPM.y + 1)),
.keep = "unused") %>%
filter(is.finite(foldchange)) %>%
slice_max(n = 10, order_by = foldchange, with_ties = FALSE)
-output
gene_id TPM foldchange
1 gene100_C1orf233 9.45 1.786222
2 gene10000_CD86 3.09 1.434249
3 gene10007_LOC102723582 1.38 1.288517
4 gene10016_HSPBAP1 22.04 1.217311
5 gene10012_PARP15 0.59 1.162893
6 gene10005_FAM162A 120.65 1.089205
7 gene10010_PARP9 20.39 1.011953
8 gene1_WASH7P 47.56 1.008704
9 gene10011_DTX3L 13.66 -1.033968
10 gene10003_CSTA 3.73 -1.076854
The merge results in memory error because it was done on two vectors creating a cartesian join
Here is my toy data.
df <- tibble::tribble(
~fund, ~dates, ~y, ~x,
"Fund_A", "03/31/2021", 0.04, 0.04,
"Fund_A", "04/30/2021", 0.04, -0.03,
"Fund_A", "05/31/2021", 0.03, 0.04,
"Fund_A", "06/30/2021", -0.01, 0.03,
"Fund_A", "07/31/2021", -0.06, -0.03,
"Fund_A", "08/31/2021", 0.04, 0.05,
"Fund_A", "09/30/2021", 0.01, -0.04,
"Fund_A", "10/31/2021", 0.02, -0.01,
"Fund_A", "11/30/2021", 0.03, -0.03,
"Fund_A", "12/31/2021", -0.02, 0.06,
"Fund_B", "03/31/2021", 0.01, 0.02,
"Fund_B", "04/30/2021", 0.01, 0.05,
"Fund_B", "05/31/2021", 0.05, -0.05,
"Fund_B", "06/30/2021", 0.01, -0.02,
"Fund_B", "07/31/2021", 0.04, 0.09,
"Fund_B", "08/31/2021", 0.02, -0.01,
"Fund_B", "09/30/2021", 0.02, 0.02,
"Fund_B", "10/31/2021", -0.01, 0.01,
"Fund_B", "11/30/2021", 0.05, 0.01,
"Fund_B", "12/31/2021", -0.03, 0.02
)
I have code that runs the rolling regression and spits out the regression output using slider package.
library(tidyverse)
library(slider)
library(broom)
df %>%
group_by(fund) %>%
mutate(model = slide(.x = cur_data(),
.f = possibly(~(lm(y ~ x, data = .x) %>%
tidy() %>%
filter(term != "(Intercept)")),
otherwise = NA),
.before = 5)) %>%
ungroup() %>%
unnest(model)
Now, I want to be able to run the above code with multiple values of funds and ".before" values and combine the results in one dataframe. In other words, I want the above code to work on say .before = seq(4, 7,1). It would be interesting to see an attempt using purrr map!
To carry out the same operation multiple times, we can use a for-loop or an apply function.
To keep the code tidy, I first made a function out of the code to repeat, with the value of .before as a parameter. Then lapply() executes that function multiple times. Then do.call(rbind) binds the resulting dataframes together.
df <- tibble::tribble(
~fund, ~dates, ~y, ~x,
"Fund_A", "03/31/2021", 0.04, 0.04,
...
"Fund_B", "12/31/2021", -0.03, 0.02
)
library('tidyverse')
library('slider')
library('broom')
#
# function that performs the action for a single value for .before; returns a dataframe
# example: calculate_coefficient(df, 4)
#
calculate_lm_values <- function(df, .before) {
df %>%
group_by(fund) %>%
mutate(model = slide(.x = cur_data(),
.f = possibly(~(lm(y ~ x, data = .x) %>%
tidy() %>%
filter(term != "(Intercept)")),
otherwise = NA),
.before = .before),
before = .before) %>%
ungroup() %>%
unnest(model)
}
#
# run function multiple times and bind rows together
#
df_results2 <- map_dfr(4:7, ~calculate_lm_values(df, .x))
# alternatively:
# df_results <- lapply( 4:7, function(x) calculate_lm_values(df, x) )
# df_results <- do.call(rbind, df_results)
df_results
I would like to use something like dplyr to create a subset of data from one data frame using conditions from another data frame. So in one data frame I have a set of data with minimum and maximum years and other sea-level data lsp , and in another frame I have a time series of ocean dynamics. For each row in the lsp dataframe, I would like to extract every year between the minimum and maximum ages in the dynamics data frame and create a sub set of data. I think this will require a for loop. Does anyone have any idea if this is possible?
Desired output using row 1 of LSP as an example:
Row 1 LSP (simplified) is:
Age min
Age max
1997
2007
I want to use this information to create a data frame like this from the dynamics file:
Subset
Year
Dynamics
1997
125
1998
109
1999
152
2000
161
2001
106
2002
120
2003
58
2004
68
2005
110
2006
144
2007
100
Many thanks
## LSP data
structure(list(Depth = c(0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5,
8.5, 10.5, 13.5, 14.5, 18.5, 19.5, 27.5, 28.5, 32, 35.5, 40.5,
41.5), RSL = c(0.03, 0.03, 0.01, 0.01, -0.04, -0.01, -0.03, 0,
0.04, 0.03, 0, -0.01, -0.05, -0.07, -0.19, -0.24, -0.31, -0.31,
-0.27, -0.29), RSL_err_1sig = c(0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
0.1), Age_mean = c(2001.754499, 1994.278776, 1987.678949, 1980.805889,
1973.270485, 1965.018421, 1957.442729, 1952.134369, 1949.031929,
1945.148184, 1939.132213, 1936.957531, 1927.311071, 1924.379033,
1897.26123, 1892.977317, 1876.1995, 1858.135589, 1825.967544,
1820.605298), Age.min = c(1996.752238, 1985.111654, 1977.483594,
1968.26211, 1961.886124, 1958.219318, 1947.496532, 1943.084044,
1941.761439, 1935.843414, 1923.952516, 1920.057048, 1906.228232,
1902.242998, 1875.327613, 1869.925103, 1834.992176, 1811.928966,
1784.998245, 1767.524866), Age.max = c(2006.75676, 2003.445898,
1997.874304, 1993.349668, 1984.654846, 1971.817524, 1967.388926,
1961.184694, 1956.302419, 1954.452954, 1954.31191, 1953.858014,
1948.39391, 1946.515068, 1919.194847, 1916.029531, 1917.406824,
1904.342212, 1866.936843, 1873.68573)), class = "data.frame", row.names = c(NA,
-20L))
## Dynamics (only head)
structure(list(Year = 1815:1820, dynamics = c(-76.01893261, -64.50519732,
-66.06270761, -76.22822397, -72.35960029, -77.34157443)), row.names = c(NA,
6L), class = "data.frame")
Here is a base R option with Map and subset -
Map(function(x, y) subset(dynamics, Year >= x & Year <= y),
LSP$Age.min, LSP$Age.max)
The same logic can be implemented using tidyverse functions as well.
library(dplyr)
library(purrr)
map2(LSP$Age.min, LSP$Age.max, ~dynamics %>% filter(Year >= .x & Year <= .y))
As long as your dataset isn't huge, I would take something like the following approach.
Add the (nested) dynamics dataset to each row of your lsp dataset
Unnest the dynamics dataset to get one row per year
Filter out years that aren't relevant
(Optional)
Renest the dynamics columns to you have one row per lsp record with a tibble for all relevant years from the dynamics set.
lsp %>%
add_column(dynamics %>% nest(data = everything())) %>%
unnest(data) %>%
filter(year >= min & year <= max) %>%
nest(filtered = c(year, value))
I guess this does what you want to do. First assign names to your input data, so later you know what my codes mean.
lsp <- structure(list(Depth = c(0.5, 1.5, 2.5, 3.5, 4.5, 5.5, 6.5, 7.5,
8.5, 10.5, 13.5, 14.5, 18.5, 19.5, 27.5, 28.5, 32, 35.5, 40.5,
41.5), RSL = c(0.03, 0.03, 0.01, 0.01, -0.04, -0.01, -0.03, 0,
0.04, 0.03, 0, -0.01, -0.05, -0.07, -0.19, -0.24, -0.31, -0.31,
-0.27, -0.29), RSL_err_1sig = c(0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1,
0.1), Age_mean = c(2001.754499, 1994.278776, 1987.678949, 1980.805889,
1973.270485, 1965.018421, 1957.442729, 1952.134369, 1949.031929,
1945.148184, 1939.132213, 1936.957531, 1927.311071, 1924.379033,
1897.26123, 1892.977317, 1876.1995, 1858.135589, 1825.967544,
1820.605298), Age.min = c(1996.752238, 1985.111654, 1977.483594,
1968.26211, 1961.886124, 1958.219318, 1947.496532, 1943.084044,
1941.761439, 1935.843414, 1923.952516, 1920.057048, 1906.228232,
1902.242998, 1875.327613, 1869.925103, 1834.992176, 1811.928966,
1784.998245, 1767.524866), Age.max = c(2006.75676, 2003.445898,
1997.874304, 1993.349668, 1984.654846, 1971.817524, 1967.388926,
1961.184694, 1956.302419, 1954.452954, 1954.31191, 1953.858014,
1948.39391, 1946.515068, 1919.194847, 1916.029531, 1917.406824,
1904.342212, 1866.936843, 1873.68573)), class = "data.frame", row.names = c(NA,
-20L))
dynamics <- structure(list(Year = 1815:1820, dynamics = c(-76.01893261, -64.50519732,
-66.06270761, -76.22822397, -72.35960029, -77.34157443)), row.names = c(NA,
6L), class = "data.frame")
Then the actual codes to get the subset.
# first get info of years from the "lsp" dataset
# following your example in your comments
year_min <- list()
year_max <- list()
all_years <- list()
for(i in 1:nrow(lsp)){
year_min[[i]] <- round(lsp$Age.min[[i]])
year_max[[i]] <- round(lsp$Age.max[[i]])
all_years[[i]] <- c(year_min[[i]]:year_max[[i]])
all_years[[i]] <- as.data.frame(all_years[[i]])
colnames(all_years[[i]]) <- "Year"
}
# now join the info on "Year" from "lsp" data with "dynamics" data to get the subset
library(dplyr)
subset_output <- list()
for (i in 1:length(all_years)){
subset_output[[i]] <- left_join(dynamics,all_years[[i]])
}
My data looks like this:
> dput(head(CORt, 5))
structure(list(rDate = structure(c(1438019100, 1438019400, 1438019700,
1438020000, 1438020300), class = c("POSIXct", "POSIXt"), tzone = "UTC"),
I630 = c(0.536, 0.506, 0.419, 0.456, 0.427), I800 = c(0.414,
0.388, 0.339, 0.351, 0.331), I532 = c(0.547, 0.534, 0.463,
0.488, 0.464), I570 = c(0.522, 0.508, 0.467, 0.468, 0.445
), WR630 = c(0.0127, 0.0573, 0.0083, 0.0057, 0.0053), WR800 = c(0.0144,
0.0506, 0.0249, 0.0163, 0.0159), WR532 = c(0.0139, 0.0394,
0.006, 0.005, 0.0049), WR570 = c(0.0176, 0.0379, 0.0094,
0.0054, 0.0049), NR630 = c(0.006, 0.034, 0.006, 0.004, 0.004
), NR800 = c(0.007, 0.04, 0.019, 0.02, 0.019), NR532 = c(0.007,
0.072, 0.01, 0.007, 0.007), NR570 = c(0.009, 0.077, 0.008,
0.007, 0.007), ER630 = c(0.0351, 0.0746, 0.0116, 0.0055,
0.0052), ER800 = c(0.0278, 0.0596, 0.03, 0.0324, 0.0303),
ER532 = c(0.04, 0.085, 0.013, 0.008, 0.008), ER570 = c(0.034,
0.083, 0.013, 0.009, 0.008)), row.names = c(NA, 5L), class = "data.frame")
In the CORt dataframe when the values of WR630 > I630 I want to turn all values of that row(s) into NA but I want to preserve the rDate column dates and the ER532 values of that row(s).
I have been using this code (example):
which(CORt$WR630>CORt$I630)
CORt[c(7632, 12530, 13684, 14260, 18295, 19735, 23770, 24634, 27529, 44055), setdiff(names(CORt), c("rDate", "ER532"))] <- NA
but this is not handy when I have 200 lines, for example. I'm looking for a code that will turn the row values when WR630 > I630 into NA directly.
Any help is much appreciated.
You can use the which command instead of typing output all the row numbers manually.
CORt[which(CORt$WR630>CORt$I630),setdiff(names(CORt), c("rDate", "ER532"))] <- NA
If you don't have any missing values in the data you can also skip which.
CORt[CORt$WR630>CORt$I630,setdiff(names(CORt), c("rDate", "ER532"))] <- NA
How does this work for you?
nrow(data) %>% map(
.f = function(i) {
if (data[i,"WR630"] > data[i,"I630"] ) {
data[i,-c(1,16)] <- NA
}
data
}
)
Please note that it uses index numbering, instead of names to avoid setting rDate and ER532 to NA. In the data you provided, I didnt find any cases where you condition held true, so I tested it reversely to be certain it works.
I have function which is an extension of an earlier question here
Function to calculate median by column to an R dataframe that is done regularly to multiple dataframes
my function below
library(outliers)
MscoreMax <- 3
scores_na <- function(x, ...) {
not_na <- !is.na(x)
scores <- rep(NA, length(x))
scores[not_na] <- outliers::scores(na.omit(x), ...)
scores
}
mediansFunction <- function(x){
labmedians <- sapply(x[-1], median)
median_of_median <- median(labmedians)
grand_median <- median(as.matrix(x[-1]))
labMscore <- as.vector(round(abs(scores_na(labmedians, "mad")), digits = 2)) #calculate mscore by lab
labMscoreIndex <- which(labMscore > MscoreMax) #get the position in the vector that exceeds Mscoremax
x[-1][labMscoreIndex] <- NA # discharge values above threshold by making NA
return(x)
}
the function has the desired outcome of converting my Mscore values above the threshold to NA. However, I would like to send
labmedians
grand_median
labMscore
As their own variables to the global environment from within the function, but not as a list of items as 3 variables. Can i do this or is better to create a second function which is slightly different that sends the variables to the global environment as a function then use list2env outside the function afterwards to extract the variables as seperate items?
my df below
structure(list(Determination_No = 1:6, `2` = c(0.08, 0.08, 0.08,
0.08, 0.08, 0.08), `3` = c(0.08, 0.07, 0.07, 0.08, 0.07, 0.07
), `4` = c(0.07, 0.08, 0.08, 0.08, 0.07, 0.08), `5` = c(0.08,
0.08, 0.08, 0.08, 0.09, 0.09), `7` = c(0.09, 0.09, 0.11, 0.1,
0.1, 0.1), `8` = c(0.086, 0.087, 0.086, 0.09, 0.083, 0.079),
`10` = c(0.049748274, 0.049748274, 0.066331032, 0.066331032,
0.066331032, 0.049748274), `12` = c(0.086, 0.078, 0.078,
0.077, 0.077, 0.068)), class = "data.frame", row.names = c(NA,
-6L))
It is not recommended to write to global environment from inside the function. If you want to create multiple objects in the global environment return a named list from the function and use list2env.
mediansFunction <- function(x){
labmedians <- sapply(x[-1], median)
median_of_median <- median(labmedians)
grand_median <- median(as.matrix(x[-1]))
labMscore <- as.vector(round(abs(scores_na(labmedians, "mad")), digits = 2)) #calculate mscore by lab
labMscoreIndex <- which(labMscore > MscoreMax) #get the position in the vector that exceeds Mscoremax
x[-1][labMscoreIndex] <- NA # discharge values above threshold by making NA
dplyr::lst(data = x, labmedians, grand_median, labMscore)
}
result <- mediansFunction(df)
list2env(result, .GlobalEnv)
Now you have variables data, labmedians, grand_median and labMscore in the global environment.