Data wrangling into long format in R

I have a source dataset from a Nature article. I was wondering how I could extract the values from rows 4 and 12 into a long data format with the relevant group assigned (i.e. Inefficient/Efficient).
This is the code I have used to get the data into R.
# load the required libraries
library(ggsignif)
library(readxl)
library(svglite)
library(tidyverse)
library(tidyr)
library(dplyr)
# The paper from which the figure is taken is Tasdogen et al. (2020)
# Metabolic heterogeneity confers differences in melanoma metastatic potential
# The figure is 2b and can be accessed at
# https://www.nature.com/articles/s41586-019-1847-2#MOESM3
# The link to the raw data used in the article is given below and directly imported for plotting
url <-'https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-019-1847-2/MediaObjects/41586_2019_1847_MOESM3_ESM.xlsx'
# create a data frame from the Excel data
temp <- tempfile()
download.file(url, temp, mode='wb')
myData <- read_excel(path = temp)
I can't figure out how to insert an image of the dataset, but it should show up with the previous code. I need columns 2-31 for Efficient and 2-37 for Inefficient.
I hope that's enough information for people to understand what I'm talking about.

This data is really not structured well for a general read like that, but I'll try to make do. Starting from your myData <- read_excel(...) call above:
Data_wide <- myData[c(2:4, 10:12), c(2:37)]
tmp <- as.data.frame(t(Data_wide))
head(tmp)
# V1 V2 V3 V4 V5 V6
# ...2 Efficient #1 0.47699999999999998 Inefficient #1 0.48499999999999999
# ...3 Efficient #2 0.376 Inefficient #2 0.47399999999999998
# ...4 Efficient #3 0.496 Inefficient #3 0.48799999999999999
# ...5 Efficient #4 0.32500000000000001 Inefficient #4 0.45600000000000002
# ...6 Efficient #5 8.8999999999999996E-2 Inefficient #5 0.53100000000000003
# ...7 Efficient #6 4.5999999999999999E-2 Inefficient #6 0.318
tmp <- rbind(tmp[,1:3], setNames(tmp[,4:6], names(tmp)[1:3]))
head(tmp)
# V1 V2 V3
# ...2 Efficient #1 0.47699999999999998
# ...3 Efficient #2 0.376
# ...4 Efficient #3 0.496
# ...5 Efficient #4 0.32500000000000001
# ...6 Efficient #5 8.8999999999999996E-2
# ...7 Efficient #6 4.5999999999999999E-2
tmp <- tmp[complete.cases(tmp),]
tmp$V3 <- as.numeric(tmp$V3)
rownames(tmp) <- NULL
head(tmp,3); tail(tmp,3)
# V1 V2 V3
# 1 Efficient #1 0.477
# 2 Efficient #2 0.376
# 3 Efficient #3 0.496
# V1 V2 V3
# 64 Inefficient #34 0.2451
# 65 Inefficient #35 0.2450
# 66 Inefficient #36 0.2529
With this structure, you can subset (remove V2, though I wonder why you feel it is not important) and rename the columns with colnames(tmp) <- c(...).
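For instance, a minimal sketch of that subset-and-rename step (the final column names here are my choice, not from the question):
# keep the group label and the value, drop the mouse id in V2
tmp <- tmp[, c("V1", "V3")]
colnames(tmp) <- c("Group", "Glucose_m6")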

Although it might not be pretty, I believe this would be your solution using only the readxl and tidyverse packages:
# Select the first set of rows containing the group labels and values
set1 <- myData %>%
  filter(row_number() %in% c(2, 4))
# Select the second set of rows containing the group labels and values
set2 <- myData %>%
  filter(row_number() %in% c(10, 12))
# Join both sets, so that all group labels are in one row and all values are in another
left_join(set1, set2, by = "Fractional enrichment of glucose m+6 in primary subcutaneous tumors after [U-13C]glucose infusion") %>%
  # Pivot to a long format, stacking the original columns
  pivot_longer(cols = !`Fractional enrichment of glucose m+6 in primary subcutaneous tumors after [U-13C]glucose infusion`) %>%
  # Pivot wider to a format with the group label and value in separate columns
  pivot_wider(names_from = `Fractional enrichment of glucose m+6 in primary subcutaneous tumors after [U-13C]glucose infusion`, values_from = value) %>%
  # Remove the old column names/numbers
  select(-name)
# A tibble: 72 x 2
Group `Glucose m+6`
<chr> <chr>
1 Inefficient 0.48499999999999999
2 Inefficient 0.47399999999999998
3 Inefficient 0.48799999999999999
4 Inefficient 0.45600000000000002
5 Inefficient 0.53100000000000003
6 Inefficient 0.318
7 Inefficient 0.26600000000000001
8 Inefficient 0.30399999999999999
9 Inefficient 0.309
10 Inefficient 0.33
# ... with 62 more rows
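Note that the values come through as character (the <chr> type above). Assuming you assign the result of the pipeline to a variable, say long_df (my name, not part of the answer), one more mutate makes them numeric:
# convert the value column from character to numeric
long_df <- long_df %>%
  mutate(`Glucose m+6` = as.numeric(`Glucose m+6`))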

A clean way to address your problem is to use the libraries tidyxl and unpivotr.
They may seem rather complicated at first, but they are probably the cleanest way to handle Excel files. I left some comments to help you go through it.
I suggest you have a look at the unpivotr vignettes.
# libraries
library(tidyverse)
library(tidyxl)
library(unpivotr)
# download data
url <-'https://static-content.springer.com/esm/art%3A10.1038%2Fs41586-019-1847-2/MediaObjects/41586_2019_1847_MOESM3_ESM.xlsx'
temp <- tempfile()
download.file(url, temp, mode='wb')
# read excel file
myData <- xlsx_cells(path = temp)
# select the sheet
figure1a <- myData %>% filter(sheet == "Figure 1 A")
# you can visualize data in an excel-like format with
# View(rectify(figure1a))
# since the sheet is composed of two tables
# get the top-left corner of each table (where in the first column you find Group)
corners <- figure1a %>% filter(character == "Group")
# partition the spreadsheet based on the corners you just got
# select the rows you will need
partitions <- figure1a %>% filter(row %in% c(3:5, 11:13)) %>% partition(corners)
# get the two partitions and edit them
# with purrr::map it will be easy
df <- partitions$cells %>%
  # the first column of each partition holds the headers
  map(behead, "left", "header") %>%
  # the first row of each partition holds the group: Efficient/Inefficient
  map(behead, "up", "Group") %>%
  # the second row of each partition holds the mouse id;
  # map_dfr also binds the edited partitions together
  map_dfr(behead, "up", "Mouse_ID") %>%
  # select the columns we need
  select(Group, Mouse_ID, Glucose_m6 = numeric)
# the final result
df
#> # A tibble: 66 x 3
#> Group Mouse_ID Glucose_m6
#> <chr> <chr> <dbl>
#> 1 Efficient #1 0.477
#> 2 Efficient #2 0.376
#> 3 Efficient #3 0.496
#> 4 Efficient #4 0.325
#> 5 Efficient #5 0.089
#> 6 Efficient #6 0.046
#> 7 Efficient #7 0.213
#> 8 Efficient #8 0.082
#> 9 Efficient #9 0.359
#> 10 Efficient #10 0.306
#> # ... with 56 more rows
Created on 2021-11-04 by the reprex package (v2.0.0)
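Since the question loads ggsignif and svglite for plotting, here is a minimal ggplot2 sketch of how the tidy df above could be drawn; the geoms and labels are my own choices, not a reproduction of the paper's figure:
library(ggplot2)
ggplot(df, aes(x = Group, y = Glucose_m6)) +
  # one point per mouse, jittered to reduce overplotting
  geom_jitter(width = 0.1, alpha = 0.6) +
  # a crossbar marking each group's mean
  stat_summary(fun = mean, geom = "crossbar", width = 0.3) +
  labs(y = "Fractional enrichment of glucose m+6")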

Related

Why doesn't R dplyr arrange sort properly using a vector element within a for loop

I'm having trouble getting R's dplyr::arrange() to sort properly when used in a for loop. I found many posts discussing this issue (like ex.1 with .by_group=TRUE and desc(), ex.2 with lists, and ex.3 with filter_all() and %in%). Yet, I'm still having a bit of trouble understanding why I can get the arrange() to work when I use the column name directly but not when I refer to its index position within a vector, which will later be used in a loop to aid data extraction from a larger dataframe.
Here is a reproducible toy data to demonstrate:
set.seed(1)
toy <- data.frame(a = rep(sample(letters[1:5], 4, TRUE)),
                  tf = sample(c("T", "F"), 100, TRUE),
                  n1 = sample(1:100, 100, TRUE),
                  n2 = 1:100)
get_it <- colnames(toy)[3:4]
My initial approach so far works with the indexed vector in the select() portion, but fails to sort in the arrange() even with the .by_group option. I also tried adding dplyr::arrange(), but no change.
j <- 1 # pretending this is the 1st pass in the loop
toy %>%
  select(a, tf, get_it[j]) %>%
  group_by(a) %>%
  arrange(desc(get_it[j]), .by_group = TRUE)
a tf n1
<chr> <chr> <int>
a T 21
a T 17
a F 87
a T 90
a T 64
example output truncated
However, I get the intended sorted results when I swap the indexed vector in the arrange() for the column's actual name (select still works fine):
j <- 1 # pretending this is the 1st pass through the loop
toy %>%
  select(a, tf, get_it[j]) %>%
  group_by(a) %>%
  arrange(desc(n1), .by_group = TRUE)
a tf n1
<chr> <chr> <int>
a F 99
a F 98
a F 96
a F 95
a T 93
example output truncated
Why does the second version work, but not the first? What should I change so that I can loop this through many columns?
Thanks in advance! I appreciate your time!
This is "programming with dplyr", use .data for referencing columns by a string:
toy %>%
  select(a, tf, get_it[j]) %>%
  group_by(a) %>%
  arrange(desc(.data[[ get_it[j] ]]), .by_group = TRUE)
# # A tibble: 100 x 3
# # Groups: a [3]
# a tf n1
# <chr> <chr> <int>
# 1 a F 99
# 2 a F 98
# 3 a F 96
# 4 a F 95
# 5 a T 93
# 6 a T 92
# 7 a T 92
# 8 a T 90
# 9 a F 87
# 10 a F 86
# # ... with 90 more rows
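And since the goal was to loop over many columns, a small sketch building on the same .data idiom; the print() call is only there to display each arranged result:
for (col in get_it) {
  toy %>%
    select(a, tf, all_of(col)) %>%
    group_by(a) %>%
    arrange(desc(.data[[col]]), .by_group = TRUE) %>%
    print()
}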

R grouped time series correlations with tidyverse

I want time series correlations in a grouped data frame. Here's a sample dataset:
x <- cbind(expand.grid(type = letters[1:4], time = seq(1:4), kind = letters[5:8]),
           value = rnorm(64)) %>%
  arrange(type, time, kind)
which produces 64 rows of the variables type, time, kind and value.
I want a time series correlation of the values for each kind grouped by type. Think of each type and time combination as an ordered vector of 4 values. I group by type and time, then arrange by kind, then remove kind.
y <- x %>% group_by(type) %>% arrange(type, time, kind) %>% select(-kind)
I can then group y by type and time and nest such that all the values are together in the data variable, regroup by type only and create a new variable which is the lead data.
z <- y %>%
  group_by(type, time) %>%
  nest(value) %>%
  group_by(type) %>%
  mutate(ahead = lead(data))
Now I want to run mutate(R = cor(data, ahead)), but I can't seem to get the syntax correct.
I've also tried mutate(R = cor(data$value, ahead$value)) and mutate(R = cor(data[1]$value, ahead[1]$value)), to no avail.
The error I get from cor is: supply both 'x' and 'y' or a matrix-like 'x'.
How do I reference the data and ahead variables as vectors to run with cor?
Ultimately, I'm looking for a 16 row data frame with columns type, time, and R where R is a single correlation value.
Thank you for your attention.
We can use map2_dbl from purrr to pass data and ahead at the same time to the cor function.
library(dplyr)
z %>%
  mutate(R = purrr::map2_dbl(data, ahead, cor)) %>%
  select(-data, -ahead)
# type time R
# <fct> <int> <dbl>
# 1 a 1 0.358
# 2 a 2 -0.0498
# 3 a 3 -0.654
# 4 a 4 1
# 5 b 1 -0.730
# 6 b 2 0.200
# 7 b 3 -0.928
# 8 b 4 1
# 9 c 1 0.358
#10 c 2 0.485
#11 c 3 -0.417
#12 c 4 1
#13 d 1 0.140
#14 d 2 -0.448
#15 d 3 -0.511
#16 d 4 1
In base R, we can use mapply:
z$R <- mapply(cor, z$data, z$ahead)

How to run a for loop for each group in a dataframe?

This question is similar to this one asked earlier but not quite. I would like to iterate through a large dataset (~500,000 rows) and for each unique value in one column, I would like to do some processing of all the values in another column.
Here is code that I have confirmed to work:
df = matrix(nrow=783,ncol=2)
counts = table(csvdata$value)
p = (as.vector(counts))/length(csvdata$value)
D = 1 - sum(p**2)
The only problem with it is that it returns the value D for the entire dataset, rather than returning a separate D value for each set of rows where ID is the same.
Say I had data with an ID column and a value column (the example table image from the original post is not reproduced here).
How would I be able to do the same thing as the code above, but return a D value for each group of rows where ID is the same, rather than for the entire dataset? I imagine this requires a loop, and creating a matrix to store all the D values in with ID in one column and the value of D in the other, but not sure.
Ok, let's work with "In short, I would like whatever is in the for loop to be executed for each block of data with a unique value of "ID"".
In general you can group rows by values in one column (e.g. "ID") and then perform some transformation based on values/entries in other columns per group. In the tidyverse this would look like this
library(tidyverse)
df %>%
  group_by(ID) %>%
  mutate(value.mean = mean(value))
## A tibble: 8 x 3
## Groups: ID [3]
# ID value value.mean
# <fct> <int> <dbl>
#1 a 13 12.6
#2 a 14 12.6
#3 a 12 12.6
#4 a 13 12.6
#5 a 11 12.6
#6 b 12 15.5
#7 b 19 15.5
#8 cc4 10 10.0
Here we calculate the mean of value per group, and add these values to every row. If instead you wanted to summarise values, i.e. keep only the summarised value(s) per group, you would use summarise instead of mutate.
library(tidyverse)
df %>%
  group_by(ID) %>%
  summarise(value.mean = mean(value))
## A tibble: 3 x 2
# ID value.mean
# <fct> <dbl>
#1 a 12.6
#2 b 15.5
#3 cc4 10.0
The same can be achieved in base R using one of tapply, ave, or by. As far as I understand your problem statement, there is no need for a for loop; just apply a function per group (see the tapply sketch after the sample data below).
Sample data
df <- read.table(text =
"ID value
a 13
a 14
a 12
a 13
a 11
b 12
b 19
cc4 10", header = T)
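For the summarised version, a minimal base R sketch with tapply (one of the functions mentioned above), applied to the same sample df:
# mean of value per ID, returned as a named vector
tapply(df$value, df$ID, mean)
#    a    b  cc4
# 12.6 15.5 10.0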
Update
To conclude from the comments and chat, this should be what you're after.
# Sample data
set.seed(2017)
csvdata <- data.frame(
  microsat = rep(c("A", "B", "C"), each = 8),
  allele = sample(20, 3 * 8, replace = T))

csvdata %>%
  group_by(microsat) %>%
  summarise(D = 1 - sum(prop.table(table(allele))^2))
## A tibble: 3 x 2
# microsat D
# <fct> <dbl>
#1 A 0.844
#2 B 0.812
#3 C 0.812
Note that prop.table returns fractions and is shorter than your (as.vector(counts))/length(csvdata$value). Note also that you can reproduce your results for all values (irrespective of ID) if you omit the group_by line.
A base R option would be
df1$value.mean <- with(df1, ave(value, ID))
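The per-group D statistic also works without dplyr; a minimal base R sketch with tapply, reusing the csvdata sample from the update above:
# 1 - sum of squared allele proportions, computed per microsat
D_by_group <- tapply(csvdata$allele, csvdata$microsat,
                     function(x) 1 - sum(prop.table(table(x))^2))
D_by_group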

Create column of a tibble (or data frame) that contains a list from a long-format tibble

I have objects that have varying numbers of events at varying times. This is currently stored in a long format (using tibbles from library(tidyverse)):
timing_tbl <- tibble(ID = c(101, 101, 101, 102, 102, 103, 103, 103, 103),
                     event_time = c(0, 4, 8, 0, 6, 0, 4, 9, 12))
The real data has thousands of objects, with up to 50 or so events, so I want to make this process as efficient as possible.
I would like to convert this to a pseudo-wide format, where the first column is the patient ID, and the second column is a list of the event times for that object. I can do that where the second column is a column of tibbles in the following way
tmp <- lapply(unique(timing_tbl$ID),
              function(x) timing_tbl[timing_tbl$ID == x, "event_time"])
timing_tbl2 <- tibble(unique(timing_tbl$ID), tmp)
> timing_tbl2[1,2]
# A tibble: 1 x 1
tmp
<list>
1 <tibble [3 × 1]>
> timing_tbl2[[1,2]]
# A tibble: 3 x 1
event_time
<dbl>
1 0
2 4.00
3 8.00
I would prefer to store these objects as lists, as I then want to find the “distance” between each pair of objects using the following function, and I worry that extracting the vector from the list adds unnecessary processing, slowing down the calculation.
lap_exp2 <- function(x, y, tau) {
  exp(-abs(x - y) / tau)
}

distance_lap2 <- function(vec1, vec2, tau) {
  ## vec1 is the first vector of event times
  ## vec2 is the second vector of event times
  ## tau is the decay parameter
  0.5 * (sum(outer(vec1, vec1, FUN = lap_exp2, tau = tau)) +
         sum(outer(vec2, vec2, FUN = lap_exp2, tau = tau))) -
    sum(outer(vec1, vec2, FUN = lap_exp2, tau = tau))
}
distance_lap2(timing_tbl2[[1,2]]$event_time,timing_tbl2[[2,2]]$event_time,2)
[1] 0.8995764
If I try extracting the list instead of the tibble using [[
tmp <- lapply(unique(timing_tbl$ID),
              function(x) timing_tbl[[timing_tbl$ID == x, "event_time"]])
I get the following error, which makes sense
Error in col[[i, exact = exact]] : attempt to select more than one element in vectorIndex
Is there a reasonably simple way I can extract the column from the long tibble as a list and store it in the new tibble? Is this even the right way to go about this?
I've found using tidyr::nest a good way to generate the 'list columns' I think you may be after (especially for stuffing in time series-ish sort of data). Hope the following helps!
library(dplyr)
library(tidyr)
library(purrr)
timing_tbl <- tibble(ID = c(101, 101, 101, 102, 102, 103, 103, 103, 103),
                     event_time = c(0, 4, 8, 0, 6, 0, 4, 9, 12))
ID_times <-
  timing_tbl %>%
  group_by(ID) %>%
  nest(.key = "times_df") %>%
  split(.$ID) %>%
  map(~ .$times_df %>% unlist(use.names = F))
# > ID_times
# $`101`
# [1] 0 4 8
# $`102`
# [1] 0 6
# $`103`
# [1] 0 4 9 12
dists_long <-
  names(ID_times) %>%
  expand.grid(IDx = ., IDy = .) %>%
  filter(IDx != IDy) %>%
  rowwise() %>%
  mutate(dist = distance_lap2(vec1 = ID_times[[IDx]], vec2 = ID_times[[IDy]], tau = 2))
# # A tibble: 6 x 3
# IDx IDy dist
# <fct> <fct> <dbl>
# 1 102 101 0.900
# 2 103 101 0.981
# 3 101 102 0.900
# 4 103 102 1.68
# 5 101 103 0.981
# 6 102 103 1.68
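A side note, assuming you are on tidyr 1.0 or later: the .key argument to nest() is deprecated there, so the nesting step would use the newer named-column syntax; plain base split() also builds the same named list directly:
# tidyr >= 1.0 spelling of the nesting step
ID_times <-
  timing_tbl %>%
  nest(times_df = event_time) %>%
  split(.$ID) %>%
  map(~ .$times_df %>% unlist(use.names = FALSE))
# base R equivalent in a single call
ID_times <- split(timing_tbl$event_time, timing_tbl$ID)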

How to use a statistic function and subsetting data simultaneously in R?

I have data that looks like this (dat):
region muscle protein
head cerebrum 78
head cerebrum 56
head petiole 1
head petiole 2
tail pectoral 3
tail pectoral 4
I want to take the mean of the protein values for cerebrum. I tried to look up different ways to subset data here and here, but there does not seem to be a straightforward way of doing it. Right now, I'm doing this:
datcerebrum <- dat[which(dat$muscle == "cerebrum"),]
mean(datcerebrum$protein)
I tried to condense this into one line:
mean(dat[which(dat$muscle == "cerebrum"),])
But it returns NA with a warning that the argument is not numeric or logical. Is there an easy way to achieve this?
We can use aggregate from base R
aggregate(protein ~ muscle, dat, mean)
# muscle protein
#1 cerebrum 67.0
#2 pectoral 3.5
#3 petiole 1.5
I'd do this with the tidyverse package dplyr:
library(readr)
library(dplyr)
fwf <- "head cerebrum 78
head cerebrum 56
head petiole 1
head petiole 2
tail pectoral 3
tail pectoral 4"
dat <- read_fwf(fwf, fwf_empty(fwf, col_names = c("region", "muscle", "protein")))
# The above code is just to create your data frame - please provide reproducible data!
dat %>% filter(muscle == "cerebrum") %>% summarise(m = mean(protein))
#> # A tibble: 1 x 1
#> m
#> <dbl>
#> 1 67
You could even do it for every muscle at once:
dat %>% group_by(muscle) %>% summarise(m = mean(protein))
#> # A tibble: 3 x 2
#> muscle m
#> <chr> <dbl>
#> 1 cerebrum 67.0
#> 2 pectoral 3.5
#> 3 petiole 1.5
Solution using data.table:
# Load the required library
library(data.table)
# Convert your data frame into a data.table object (by reference)
setDT(dat)
# Subset the cerebrum rows and take the mean of the protein values
dat[muscle == "cerebrum", mean(protein)]
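For the exact one-liner the question attempted, plain base R indexing also works with no packages at all; this simply applies the asker's subset to the protein column rather than the whole data frame:
# mean protein for the cerebrum rows only
mean(dat$protein[dat$muscle == "cerebrum"])
#> [1] 67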
