Related
After merging three datasets I've got a mess. There is a unique id field and then there can be one or more samples associated with each id. So far I've got
samples <- structure(list(id = c(1029459, 1029459, 1029459, 1029459, 1030272,
1030272, 1030272, 1032157, 1032157, 1032178, 1032178, 1032219,
1032219, 1032229, 1032229, 1032494, 1032494, 1032780, 1032780
), sample1 = c(853401, 853401, 853401, 853401, 852769, 852769,
852769, 850161, 850161, 852711, 852711, 852597, 852597, 850363,
850363, 850717, 850717, 848763, 848763), sample2 = c(853401,
853693, 853667, 853667, 852769, 853597, 853597, NA, NA, 852711,
853419, 852597, 852597, 850363, 852741, 850717, 851811, 848763,
848763), sample3 = c(NA, NA, NA, NA, NA, NA, NA, 853621, 852621,
NA, NA, NA, NA, NA, NA, NA, NA, NA, NA)), class = c("tbl_df",
"tbl", "data.frame"), row.names = c(NA, -19L))
print(samples)
#> # A tibble: 19 × 4
#> id sample1 sample2 sample3
#> <dbl> <dbl> <dbl> <dbl>
#> 1 1029459 853401 853401 NA
#> 2 1029459 853401 853693 NA
#> 3 1029459 853401 853667 NA
#> 4 1029459 853401 853667 NA
#> 5 1030272 852769 852769 NA
#> 6 1030272 852769 853597 NA
#> 7 1030272 852769 853597 NA
#> 8 1032157 850161 NA 853621
#> 9 1032157 850161 NA 852621
#> 10 1032178 852711 852711 NA
#> 11 1032178 852711 853419 NA
#> 12 1032219 852597 852597 NA
#> 13 1032219 852597 852597 NA
#> 14 1032229 850363 850363 NA
#> 15 1032229 850363 852741 NA
#> 16 1032494 850717 850717 NA
#> 17 1032494 850717 851811 NA
#> 18 1032780 848763 848763 NA
#> 19 1032780 848763 848763 NA
I'd like to get it so that all unique samples per id are combined into one sample column with a long dataframe. eg
id sample
1029459 853401
1029459 853693
1030272 852769
1030272 853597
1032157 850161
1032157 853621
Any ideas?
Is this what you are looking for? For example:
id 1029459 has the unique samples 853401 , 853693 , 853667.
# Stack the three sample columns into one long `sample` column, dropping the
# NA placeholders during the reshape, then keep one row per (id, sample) pair.
samples %>%
  pivot_longer(
    c(sample1, sample2, sample3),
    names_to = "source_column", # which sampleN column the value came from
    values_to = "sample",
    values_drop_na = TRUE
  ) %>%
  # distinct() keeps only the listed columns, so `source_column` is dropped
  distinct(id, sample) %>%
  # explicit sort so the row order does not depend on the input order
  arrange(id, sample)
# A tibble: 16 × 2
id sample
<dbl> <dbl>
1 1029459 853401
2 1029459 853667
3 1029459 853693
4 1030272 852769
5 1030272 853597
6 1032157 850161
7 1032157 852621
8 1032157 853621
9 1032178 852711
10 1032178 853419
11 1032219 852597
12 1032229 850363
13 1032229 852741
14 1032494 850717
15 1032494 851811
16 1032780 848763
You can do it this way:
library(data.table)
# Melt a data.table *copy* of `samples` (setDT() would convert the caller's
# object by reference), drop NA samples during the melt, and select the result
# columns by name rather than by position before de-duplicating.
unique(
  melt(
    as.data.table(samples),
    id.vars = "id",
    value.name = "sample",
    na.rm = TRUE
  )[, .(id, sample)]
)
Output:
id sample
1: 1029459 853401
2: 1030272 852769
3: 1032157 850161
4: 1032178 852711
5: 1032219 852597
6: 1032229 850363
7: 1032494 850717
8: 1032780 848763
9: 1029459 853693
10: 1029459 853667
11: 1030272 853597
12: 1032178 853419
13: 1032229 852741
14: 1032494 851811
15: 1032157 853621
16: 1032157 852621
I will start off by stating that I have working code, but it is embarrassingly inefficient and clumsy. I was hoping that someone in the community might be able to show me a better way to unnest this heavily nested list.
As a background, it is transaction data on nfts that is heavily nested. I am just trying to get a data frame out, ultimately down to the daily level. I have managed to get the code working for the totalPriceUSD field, but as I mentioned, it is clumsy.
library(dplyr)
library(tidyr)
library(rlist)
library(jsonlite)
mydata <- fromJSON("https://api2.cryptoslam.io/api/nft-indexes/NFTGlobal")
# attempt at nested extraction: flatten the nested list and bind the leaves
# into a single data frame
mydata <- rlist::list.flatten(mydata) %>% dplyr::bind_rows()
# keep only the daily totalPriceUSD columns; both filters are applied to the
# same object (the original selected from `mydata1` before it was defined, and
# the second select overwrote the result of the first)
mydata <- mydata %>%
  select(contains("daily")) %>%
  select(contains("totalPriceUSD"))
# change col names: characters 24-33 of each flattened name are kept
# NOTE(review): presumably this is the yyyy-mm-dd part of the name - confirm
names(mydata) <- substring(names(mydata), 24, 33)
names(mydata) <- format(as.Date(names(mydata), format = "%Y-%m-%d"))
# wide -> long: one row per date (pivot_longer supersedes gather); the old
# `rownames<-` step is omitted because reshaping discards row names anyway
mydata <- mydata %>%
  pivot_longer(everything(), names_to = "date", values_to = "totalPriceUSD")
# convert the reshaped result (not the wide frame) and parse the date column
mydata <- as.data.frame(mydata)
mydata$date <- as.Date(mydata$date, format = "%Y-%m-%d")
As I said, it works, but it ain't pretty. Any suggestions on improving this?
Many thanks
library(dplyr)
mydata <- jsonlite::fromJSON("https://api2.cryptoslam.io/api/nft-indexes/NFTGlobal")
# Pull the "monthlySummary" element out of every top-level entry and stack
# them; the entry names are stored in `monthly_id`.
monthly <- mydata %>%
  lapply(function(entry) entry[["monthlySummary"]]) %>%
  bind_rows(.id = "monthly_id")
# Stack each entry's "dailySummaries" (names kept in `daily_id`), then stack
# the per-entry results (entry names kept in `monthly_id`).
daily <- mydata %>%
  lapply(function(entry) {
    bind_rows(entry[["dailySummaries"]], .id = "daily_id")
  }) %>%
  bind_rows(.id = "monthly_id")
monthly
# # A tibble: 60 x 6
# monthly_id totalTransactions uniqueBuyers uniqueSellers totalPriceUSD isRollingHoursData
# <chr> <int> <int> <int> <dbl> <lgl>
# 1 2017-06 193 33 32 11570. FALSE
# 2 2017-07 613 61 57 89111. FALSE
# 3 2017-08 113 36 31 15133. FALSE
# 4 2017-09 63 22 19 5154. FALSE
# 5 2017-10 52 17 11 3041. FALSE
# 6 2017-11 7259 1077 508 72760. FALSE
# 7 2017-12 265412 53406 23137 18804813. FALSE
# 8 2018-01 30693 7682 4582 1360558. FALSE
# 9 2018-02 34177 4142 4364 2931369. FALSE
# 10 2018-03 29051 3752 2784 987256. FALSE
# # ... with 50 more rows
daily
# # A tibble: 1,750 x 7
# monthly_id daily_id totalTransactions uniqueBuyers uniqueSellers totalPriceUSD isRollingHoursData
# <chr> <chr> <int> <int> <int> <dbl> <lgl>
# 1 2017-06 2017-06-23T00:00:00 27 9 6 1456. FALSE
# 2 2017-06 2017-06-24T00:00:00 15 7 8 846. FALSE
# 3 2017-06 2017-06-25T00:00:00 15 7 5 594. FALSE
# 4 2017-06 2017-06-26T00:00:00 23 10 12 1076. FALSE
# 5 2017-06 2017-06-27T00:00:00 35 8 15 2091. FALSE
# 6 2017-06 2017-06-28T00:00:00 15 6 5 1431. FALSE
# 7 2017-06 2017-06-29T00:00:00 41 13 11 2302. FALSE
# 8 2017-06 2017-06-30T00:00:00 22 11 7 1775. FALSE
# 9 2017-07 2017-07-01T00:00:00 12 7 10 3727. FALSE
# 10 2017-07 2017-07-02T00:00:00 34 13 12 3117. FALSE
# # ... with 1,740 more rows
An alternative to @r2evans's answer using rrapply() + unnest_wider(). This should generalize to arbitrary levels of nesting as well.
library(tidyr)
library(jsonlite)
library(rrapply)
mydata <- fromJSON("https://api2.cryptoslam.io/api/nft-indexes/NFTGlobal")
# Melt the list nodes named "monthlySummary" into a data frame, then spread
# the list-column `value` into one column per field.
monthly <- unnest_wider(
  rrapply(
    mydata,
    condition = function(x, .xname) .xname == "monthlySummary",
    classes = "list",
    how = "melt"
  ),
  value
)
# Same idea for the list nodes that have "dailySummaries" somewhere in
# `.xparents` once its last entry is removed.
# NOTE(review): presumably the last entry of .xparents is the node's own name
# - confirm against the rrapply documentation.
daily <- unnest_wider(
  rrapply(
    mydata,
    condition = function(x, .xparents) {
      "dailySummaries" %in% head(.xparents, -1)
    },
    classes = "list",
    how = "melt"
  ),
  value
)
monthly
#> # A tibble: 60 × 9
#> L1 L2 totalTransactio… uniqueBuyers uniqueSellers totalPriceUSD
#> <chr> <chr> <int> <int> <int> <dbl>
#> 1 2017-06 monthlySum… 193 33 32 11570.
#> 2 2017-07 monthlySum… 613 61 57 89111.
#> 3 2017-08 monthlySum… 113 36 31 15133.
#> 4 2017-09 monthlySum… 63 22 19 5154.
#> 5 2017-10 monthlySum… 52 17 11 3041.
#> 6 2017-11 monthlySum… 7259 1077 508 72760.
#> 7 2017-12 monthlySum… 265412 53406 23137 18804813.
#> 8 2018-01 monthlySum… 30693 7682 4582 1360558.
#> 9 2018-02 monthlySum… 34177 4142 4364 2931369.
#> 10 2018-03 monthlySum… 29051 3752 2784 987256.
#> # … with 50 more rows, and 3 more variables: isRollingHoursData <lgl>,
#> # productNames <lgl>, productNamesWithoutAnySale <lgl>
daily
#> # A tibble: 1,750 × 10
#> L1 L2 L3 totalTransactio… uniqueBuyers uniqueSellers totalPriceUSD
#> <chr> <chr> <chr> <int> <int> <int> <dbl>
#> 1 2017-06 dail… 2017… 27 9 6 1456.
#> 2 2017-06 dail… 2017… 15 7 8 846.
#> 3 2017-06 dail… 2017… 15 7 5 594.
#> 4 2017-06 dail… 2017… 23 10 12 1076.
#> 5 2017-06 dail… 2017… 35 8 15 2091.
#> 6 2017-06 dail… 2017… 15 6 5 1431.
#> 7 2017-06 dail… 2017… 41 13 11 2302.
#> 8 2017-06 dail… 2017… 22 11 7 1775.
#> 9 2017-07 dail… 2017… 12 7 10 3727.
#> 10 2017-07 dail… 2017… 34 13 12 3117.
#> # … with 1,740 more rows, and 3 more variables: isRollingHoursData <lgl>,
#> # productNames <lgl>, productNamesWithoutAnySale <lgl>
I need to create a sequence of numbers based on another column
I have this data frame:
head(df)
id date lc lon lat gap_days gap
1 20162.03 2003-10-19 14:33:00 Tagging -39.370 -18.480 NA <NA>
2 20162.03 2003-10-21 12:19:00 1 -38.517 -18.253 1.90694444 gap
3 20162.03 2003-10-21 13:33:00 1 -38.464 -18.302 0.05138889 no
4 20162.03 2003-10-21 16:38:00 A -38.461 -18.425 0.12847222 no
5 20162.03 2003-10-21 18:50:00 A -38.322 -18.512 0.09166667 no
6 20162.03 2003-10-23 10:33:00 B -38.674 -19.824 1.65486111 gap
I indicated the gaps of more than one day in the column "gap", based on the column gap_days.
Now, I need to split my data. Each sequence delimited by a gap will become a new individual data frame.
So, if I have the ID 20162.03, and this ID has one or more gaps, its sequence will be split according to the number of gaps.
For this, I will use the package move and the functions burst and split.
But for this I need to create a new column with a sequence of numbers indicating the new separations of IDs, as in (seq column):
id date lc lon lat gap_days gap seq
1 20162.03 2003-10-19 14:33:00 Tagging -39.370 -18.480 NA <NA> 1
2 20162.03 2003-10-21 12:19:00 1 -38.517 -18.253 1.90694444 gap 1
3 20162.03 2003-10-21 13:33:00 1 -38.464 -18.302 0.05138889 no 1
4 20162.03 2003-10-21 16:38:00 A -38.461 -18.425 0.12847222 no 1
5 20162.03 2003-10-21 18:50:00 A -38.322 -18.512 0.09166667 no 1
6 20162.03 2003-10-23 10:33:00 B -38.674 -19.824 1.65486111 gap 2
7 20162.03 2003-10-23 17:52:00 B -38.957 -19.511 0.30486111 no 2
8 20162.03 2003-11-02 08:14:00 B -42.084 -24.071 9.59861111 gap 3
9 20162.03 2003-11-02 09:36:00 A -41.999 -24.114 0.05694444 no 3
10 20687.03 2003-10-27 17:02:00 Tagging -39.320 -18.460 NA <NA> 4
11 20687.03 2003-10-27 19:44:00 2 -39.306 -18.454 0.11250000 no 4
12 20687.03 2003-10-27 21:05:00 1 -39.301 -18.458 0.05625000 no 4
But, as you can see, I have a sequence of "gap" and "no" values, but also NAs.
I can't find a solution.
does anyone have a solution?
EDIT:
structure(list(id = c("20162.03", "20162.03", "20162.03", "20162.03",
"20162.03", "20162.03", "20162.03", "20162.03", "20162.03", "20687.03",
"20687.03", "20687.03"), date = structure(c(1066573980, 1066738740,
1066743180, 1066754280, 1066762200, 1066905180, 1066931520, 1067760840,
1067765760, 1067274120, 1067283840, 1067288700), class = c("POSIXct",
"POSIXt"), tzone = "GMT"), lc = structure(c(4L, 1L, 1L, 2L, 2L,
3L, 3L, 3L, 2L, 4L, 6L, 1L), .Label = c("1", "A", "B", "Tagging",
"0", "2", "3", "N", "P", "Z"), class = "factor"), lon = c(-39.37,
-38.517, -38.464, -38.461, -38.322, -38.674, -38.957, -42.084,
-41.999, -39.32, -39.306, -39.301), lat = c(-18.48, -18.253,
-18.302, -18.425, -18.512, -19.824, -19.511, -24.071, -24.114,
-18.46, -18.454, -18.458), gap_days = c(NA, 1.90694444444444,
0.0513888888888889, 0.128472222222222, 0.0916666666666667, 1.65486111111111,
0.304861111111111, 9.59861111111111, 0.0569444444444444, NA,
0.1125, 0.05625), gap = c(NA, "gap", "no", "no", "no", "gap",
"no", "gap", "no", NA, "no", "no")), row.names = c(NA, 12L), class = "data.frame")
A simple solution with Base R:
# Running count of "gap" rows within each id. `%in%` is vectorized and never
# returns NA, so NA gaps count as "not a gap" (same values as the element-wise
# identical() check, without the type-unstable sapply()).
df$seq <- ave(as.integer(df$gap %in% "gap"), df$id, FUN = cumsum)
df
#> id date lc lon lat gap_days gap seq
#> 1 20162.03 2003-10-19 14:33:00 Tagging -39.370 -18.480 NA <NA> 0
#> 2 20162.03 2003-10-21 12:19:00 1 -38.517 -18.253 1.90694444 gap 1
#> 3 20162.03 2003-10-21 13:33:00 1 -38.464 -18.302 0.05138889 no 1
#> 4 20162.03 2003-10-21 16:38:00 A -38.461 -18.425 0.12847222 no 1
#> 5 20162.03 2003-10-21 18:50:00 A -38.322 -18.512 0.09166667 no 1
#> 6 20162.03 2003-10-23 10:33:00 B -38.674 -19.824 1.65486111 gap 2
#> 7 20162.03 2003-10-23 17:52:00 B -38.957 -19.511 0.30486111 no 2
#> 8 20162.03 2003-11-02 08:14:00 B -42.084 -24.071 9.59861111 gap 3
#> 9 20162.03 2003-11-02 09:36:00 A -41.999 -24.114 0.05694444 no 3
#> 10 20687.03 2003-10-27 17:02:00 Tagging -39.320 -18.460 NA <NA> 0
#> 11 20687.03 2003-10-27 19:44:00 2 -39.306 -18.454 0.11250000 no 0
#> 12 20687.03 2003-10-27 21:05:00 1 -39.301 -18.458 0.05625000 no 0
And then split it:
# One data frame per (id, seq) combination that actually occurs in the data.
with(df, split(df, list(id, seq), drop = TRUE))
#> $`20162.03.0`
#> id date lc lon lat gap_days gap seq
#> 1 20162.03 2003-10-19 14:33:00 Tagging -39.37 -18.48 NA <NA> 0
#>
#> $`20687.03.0`
#> id date lc lon lat gap_days gap seq
#> 10 20687.03 2003-10-27 17:02:00 Tagging -39.320 -18.460 NA <NA> 0
#> 11 20687.03 2003-10-27 19:44:00 2 -39.306 -18.454 0.11250 no 0
#> 12 20687.03 2003-10-27 21:05:00 1 -39.301 -18.458 0.05625 no 0
#>
#> $`20162.03.1`
#> id date lc lon lat gap_days gap seq
#> 2 20162.03 2003-10-21 12:19:00 1 -38.517 -18.253 1.90694444 gap 1
#> 3 20162.03 2003-10-21 13:33:00 1 -38.464 -18.302 0.05138889 no 1
#> 4 20162.03 2003-10-21 16:38:00 A -38.461 -18.425 0.12847222 no 1
#> 5 20162.03 2003-10-21 18:50:00 A -38.322 -18.512 0.09166667 no 1
#>
#> $`20162.03.2`
#> id date lc lon lat gap_days gap seq
#> 6 20162.03 2003-10-23 10:33:00 B -38.674 -19.824 1.6548611 gap 2
#> 7 20162.03 2003-10-23 17:52:00 B -38.957 -19.511 0.3048611 no 2
#>
#> $`20162.03.3`
#> id date lc lon lat gap_days gap seq
#> 8 20162.03 2003-11-02 08:14:00 B -42.084 -24.071 9.59861111 gap 3
#> 9 20162.03 2003-11-02 09:36:00 A -41.999 -24.114 0.05694444 no 3
I have a dataframe where I want to change the column names by matching to another dataframe.
Example dataframe with data and column names:
df <- data.frame("Gene_Symbol" = c("Gene1","Gene2","Gene3","Gene4","Gene5","Gene6","Gene7"),
"Sample1" = c(85657.97656,54417.78906,110949.3281,53197.45313,87156.80469,NA,23880.2832),
"Sample2" = c(10423.40918,41660.73047,40094.54688,49519.78125,129387.1094,NA,23903.25977),
"Sample3" = c(18778.68359,43655.79688,NA,57447.08984,113266.1484,44810.26172,26316.6543),
"Sample4" = c(23919.53125,47829.02344,NA,51478.58203,116275.3359,43110.94922,25417.45508),
"Sample5" = c(NA,46677.20313,63389.45313,48722.15234,NA,77135.52344,40265.6875),
"Sample6" = c(NA,68596.22656,56802.60938,44712.64063,NA,47744.17969,33689.62891),
"Sample7" = c(NA,80506.14844,48722.99219,38629.00781,NA,37885,36638.02344))
The dataframe I want to use to exchange the Sample names for the Tumor names in df above:
df2 <- data.frame("Sample_name" = c("Sample1","Sample2","Sample3","Sample4","Sample5","Sample6", "Sample7"), "Tumor_name" = c("Tumor56", "Tumor17", "Tumor99", "Tumor2", "Tumor34", "Tumor84", "Tumor51"))
I found a way in dplyr, see below, but it feels very elaborate. Is there an easier way?
library(tidyverse)
df %>%
  column_to_rownames("Gene_Symbol") %>% # gene ids move to row names ahead of the transpose
  t() %>% # samples become rows so they can be joined on
  data.frame() %>% # t() returns a matrix; convert back to a data frame
  rownames_to_column("Sample_name") %>% # expose the sample names as a join key
  left_join(df2, by = "Sample_name", copy = TRUE) %>% # attach the matching Tumor_name
  select(!Sample_name) %>% # the old sample label is no longer needed
  column_to_rownames("Tumor_name") %>% # tumor names become row names ahead of the transpose
  t() %>% # back to genes in rows
  data.frame() %>% # matrix -> data frame again
  rownames_to_column("Gene_Symbol") # restore the gene id column
It would be nice with matching to exchange name, since I can foresee that I will need to do this for subsets of the column names. Looked at rename but could not get it to work.
Also, when I transpose, I get a matrix, why is that?
Grateful for help
Henrik
Here is a tidyverse friendly solution using the !!! splice operator.
library(tidyverse)
# original data set up from stack overflow -------------------------------------
df <- data.frame("Gene_Symbol" = c("Gene1","Gene2","Gene3","Gene4","Gene5","Gene6","Gene7"),
"Sample1" = c(85657.97656,54417.78906,110949.3281,53197.45313,87156.80469,NA,23880.2832),
"Sample2" = c(10423.40918,41660.73047,40094.54688,49519.78125,129387.1094,NA,23903.25977),
"Sample3" = c(18778.68359,43655.79688,NA,57447.08984,113266.1484,44810.26172,26316.6543),
"Sample4" = c(23919.53125,47829.02344,NA,51478.58203,116275.3359,43110.94922,25417.45508),
"Sample5" = c(NA,46677.20313,63389.45313,48722.15234,NA,77135.52344,40265.6875),
"Sample6" = c(NA,68596.22656,56802.60938,44712.64063,NA,47744.17969,33689.62891),
"Sample7" = c(NA,80506.14844,48722.99219,38629.00781,NA,37885,36638.02344))
df2 <- data.frame(
"Sample_name" = c("Sample1","Sample2","Sample3","Sample4","Sample5","Sample6", "Sample7"),
"Tumor_name" = c("Tumor56", "Tumor17", "Tumor99", "Tumor2", "Tumor34", "Tumor84", "Tumor51")
)
# create named vector of variable names ----------------------------------------
# values are current variable names, vector names are the new variable names
# deframe() turns a two-column data frame into a named vector: the first
# column (new names) supplies the names, the second (old names) the values
var_names <- deframe(df2[c("Tumor_name", "Sample_name")])
var_names
#> Tumor56 Tumor17 Tumor99 Tumor2 Tumor34 Tumor84 Tumor51
#> "Sample1" "Sample2" "Sample3" "Sample4" "Sample5" "Sample6" "Sample7"
# rename variables--------------------------------------------------------------
df_updated <- df %>%
rename(!!!var_names)
df
#> Gene_Symbol Sample1 Sample2 Sample3 Sample4 Sample5 Sample6
#> 1 Gene1 85657.98 10423.41 18778.68 23919.53 NA NA
#> 2 Gene2 54417.79 41660.73 43655.80 47829.02 46677.20 68596.23
#> 3 Gene3 110949.33 40094.55 NA NA 63389.45 56802.61
#> 4 Gene4 53197.45 49519.78 57447.09 51478.58 48722.15 44712.64
#> 5 Gene5 87156.80 129387.11 113266.15 116275.34 NA NA
#> 6 Gene6 NA NA 44810.26 43110.95 77135.52 47744.18
#> 7 Gene7 23880.28 23903.26 26316.65 25417.46 40265.69 33689.63
#> Sample7
#> 1 NA
#> 2 80506.15
#> 3 48722.99
#> 4 38629.01
#> 5 NA
#> 6 37885.00
#> 7 36638.02
df_updated
#> Gene_Symbol Tumor56 Tumor17 Tumor99 Tumor2 Tumor34 Tumor84
#> 1 Gene1 85657.98 10423.41 18778.68 23919.53 NA NA
#> 2 Gene2 54417.79 41660.73 43655.80 47829.02 46677.20 68596.23
#> 3 Gene3 110949.33 40094.55 NA NA 63389.45 56802.61
#> 4 Gene4 53197.45 49519.78 57447.09 51478.58 48722.15 44712.64
#> 5 Gene5 87156.80 129387.11 113266.15 116275.34 NA NA
#> 6 Gene6 NA NA 44810.26 43110.95 77135.52 47744.18
#> 7 Gene7 23880.28 23903.26 26316.65 25417.46 40265.69 33689.63
#> Tumor51
#> 1 NA
#> 2 80506.15
#> 3 48722.99
#> 4 38629.01
#> 5 NA
#> 6 37885.00
#> 7 36638.02
Created on 2022-02-24 by the reprex package (v2.0.1)
We could use match
# Look up every column name in df2$Sample_name. Columns without a match
# (Gene_Symbol, or any sample missing from df2) keep their current name
# instead of being renamed to NA, and no column position is hard-coded.
tumor_idx <- match(names(df), df2$Sample_name)
has_match <- !is.na(tumor_idx)
names(df)[has_match] <- as.character(df2$Tumor_name[tumor_idx[has_match]])
df
# Gene_Symbol Tumor56 Tumor17 Tumor99 Tumor2 Tumor34 Tumor84 Tumor51
#1 Gene1 85657.98 10423.41 18778.68 23919.53 NA NA NA
#2 Gene2 54417.79 41660.73 43655.80 47829.02 46677.20 68596.23 80506.15
#3 Gene3 110949.33 40094.55 NA NA 63389.45 56802.61 48722.99
#4 Gene4 53197.45 49519.78 57447.09 51478.58 48722.15 44712.64 38629.01
#5 Gene5 87156.80 129387.11 113266.15 116275.34 NA NA NA
#6 Gene6 NA NA 44810.26 43110.95 77135.52 47744.18 37885.00
#7 Gene7 23880.28 23903.26 26316.65 25417.46 40265.69 33689.63 36638.02
We could reshape wide-to-long, merge, then reshape again to long-to-wide:
library(dplyr)
library(tidyr)
# Pivot exactly the columns that have a mapping in df2 (instead of every
# column whose name happens to start with "S"/"s"), swap each sample name for
# its tumor name via the join, then pivot back to wide.
pivot_longer(
  df,
  cols = any_of(as.character(df2$Sample_name)),
  names_to = "Sample_name"
) %>%
  left_join(df2, by = "Sample_name") %>%
  pivot_wider(id_cols = Gene_Symbol, names_from = Tumor_name, values_from = value)
## A tibble: 7 x 8
# Gene_Symbol Tumor56 Tumor17 Tumor99 Tumor2 Tumor34 Tumor84 Tumor51
# <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#1 Gene1 85658. 10423. 18779. 23920. NA NA NA
#2 Gene2 54418. 41661. 43656. 47829. 46677. 68596. 80506.
#3 Gene3 110949. 40095. NA NA 63389. 56803. 48723.
#4 Gene4 53197. 49520. 57447. 51479. 48722. 44713. 38629.
#5 Gene5 87157. 129387. 113266. 116275. NA NA NA
#6 Gene6 NA NA 44810. 43111. 77136. 47744. 37885
#7 Gene7 23880. 23903. 26317. 25417. 40266. 33690. 36638.
One approach utilizing dplyr and stringr could be:
df %>%
  # Rename only the columns listed in df2, using an exact lookup. The previous
  # regex replacement matched substrings, so a name such as "Sample10" would
  # have been turned into "Tumor560".
  rename_with(
    ~ as.character(df2$Tumor_name[match(.x, df2$Sample_name)]),
    .cols = any_of(as.character(df2$Sample_name))
  )
Gene_Symbol Tumor56 Tumor17 Tumor99 Tumor2 Tumor34 Tumor84 Tumor51
1 Gene1 85657.98 10423.41 18778.68 23919.53 NA NA NA
2 Gene2 54417.79 41660.73 43655.80 47829.02 46677.20 68596.23 80506.15
3 Gene3 110949.33 40094.55 NA NA 63389.45 56802.61 48722.99
4 Gene4 53197.45 49519.78 57447.09 51478.58 48722.15 44712.64 38629.01
5 Gene5 87156.80 129387.11 113266.15 116275.34 NA NA NA
6 Gene6 NA NA 44810.26 43110.95 77135.52 47744.18 37885.00
7 Gene7 23880.28 23903.26 26316.65 25417.46 40265.69 33689.63 36638.02
As another tidyverse solution, you can also use select and all_of instead of rename and the splice operator !!!. Otherwise, this solution is identical to Shannon Pileggi’s excellent solution.
library(tidyverse)
# original data set up from stack overflow -------------------------------------
df <- data.frame("Gene_Symbol" = c("Gene1","Gene2","Gene3","Gene4","Gene5","Gene6","Gene7"),
"Sample1" = c(85657.97656,54417.78906,110949.3281,53197.45313,87156.80469,NA,23880.2832),
"Sample2" = c(10423.40918,41660.73047,40094.54688,49519.78125,129387.1094,NA,23903.25977),
"Sample3" = c(18778.68359,43655.79688,NA,57447.08984,113266.1484,44810.26172,26316.6543),
"Sample4" = c(23919.53125,47829.02344,NA,51478.58203,116275.3359,43110.94922,25417.45508),
"Sample5" = c(NA,46677.20313,63389.45313,48722.15234,NA,77135.52344,40265.6875),
"Sample6" = c(NA,68596.22656,56802.60938,44712.64063,NA,47744.17969,33689.62891),
"Sample7" = c(NA,80506.14844,48722.99219,38629.00781,NA,37885,36638.02344))
df2 <- data.frame(
"Sample_name" = c("Sample1","Sample2","Sample3","Sample4","Sample5","Sample6", "Sample7"),
"Tumor_name" = c("Tumor56", "Tumor17", "Tumor99", "Tumor2", "Tumor34", "Tumor84", "Tumor51")
)
# create named vector of variable names ----------------------------------------
# values are current variable names, vector names are the new variable names
# deframe() turns a two-column data frame into a named vector: the first
# column (new names) supplies the names, the second (old names) the values
var_names <- deframe(df2[c("Tumor_name", "Sample_name")])
var_names
#> Tumor56 Tumor17 Tumor99 Tumor2 Tumor34 Tumor84 Tumor51
#> "Sample1" "Sample2" "Sample3" "Sample4" "Sample5" "Sample6" "Sample7"
# rename variables by using `select` and `all_of` ------------------------------
# note that other variables like `Gene_Symbol` are selected independently
df_updated <- df %>%
select(Gene_Symbol, all_of(var_names))
df
#> Gene_Symbol Sample1 Sample2 Sample3 Sample4 Sample5 Sample6
#> 1 Gene1 85657.98 10423.41 18778.68 23919.53 NA NA
#> 2 Gene2 54417.79 41660.73 43655.80 47829.02 46677.20 68596.23
#> 3 Gene3 110949.33 40094.55 NA NA 63389.45 56802.61
#> 4 Gene4 53197.45 49519.78 57447.09 51478.58 48722.15 44712.64
#> 5 Gene5 87156.80 129387.11 113266.15 116275.34 NA NA
#> 6 Gene6 NA NA 44810.26 43110.95 77135.52 47744.18
#> 7 Gene7 23880.28 23903.26 26316.65 25417.46 40265.69 33689.63
#> Sample7
#> 1 NA
#> 2 80506.15
#> 3 48722.99
#> 4 38629.01
#> 5 NA
#> 6 37885.00
#> 7 36638.02
df_updated
#> Gene_Symbol Tumor56 Tumor17 Tumor99 Tumor2 Tumor34 Tumor84
#> 1 Gene1 85657.98 10423.41 18778.68 23919.53 NA NA
#> 2 Gene2 54417.79 41660.73 43655.80 47829.02 46677.20 68596.23
#> 3 Gene3 110949.33 40094.55 NA NA 63389.45 56802.61
#> 4 Gene4 53197.45 49519.78 57447.09 51478.58 48722.15 44712.64
#> 5 Gene5 87156.80 129387.11 113266.15 116275.34 NA NA
#> 6 Gene6 NA NA 44810.26 43110.95 77135.52 47744.18
#> 7 Gene7 23880.28 23903.26 26316.65 25417.46 40265.69 33689.63
#> Tumor51
#> 1 NA
#> 2 80506.15
#> 3 48722.99
#> 4 38629.01
#> 5 NA
#> 6 37885.00
#> 7 36638.02
Created on 2022-08-31 by the reprex package (v2.0.1)
I've got one data frame with the names of variables, and a 1:p index of the order that I'd like them to be in.
I've got a second data frame where the order of these variables is all messed up. How do I take the information from the first to order the columns of the second?
1> key = data.frame(index = 1:6,vars = paste("V",1:6,sep=""))
1> key
index vars
1 1 V1
2 2 V2
3 3 V3
4 4 V4
5 5 V5
6 6 V6
1> set.seed(42)
1> data = data.frame(matrix(rnorm(60),10))
1> colnames(data) = sample(key$vars)
1> data
V3 V6 V5 V2 V4 V1
1 1.37095845 1.3048697 -0.3066386 0.45545012 0.2059986 0.32192527
2 -0.56469817 2.2866454 -1.7813084 0.70483734 -0.3610573 -0.78383894
3 0.36312841 -1.3888607 -0.1719174 1.03510352 0.7581632 1.57572752
4 0.63286260 -0.2787888 1.2146747 -0.60892638 -0.7267048 0.64289931
5 0.40426832 -0.1333213 1.8951935 0.50495512 -1.3682810 0.08976065
6 -0.10612452 0.6359504 -0.4304691 -1.71700868 0.4328180 0.27655075
7 1.51152200 -0.2842529 -0.2572694 -0.78445901 -0.8113932 0.67928882
8 -0.09465904 -2.6564554 -1.7631631 -0.85090759 1.4441013 0.08983289
9 2.01842371 -2.4404669 0.4600974 -2.41420765 -0.4314462 -2.99309008
10 -0.06271410 1.3201133 -0.6399949 0.03612261 0.6556479 0.28488295
# Order the variable names by key$index first, so the result is correct even
# when the rows of `key` are not already sorted; as.character() guards against
# key$vars being a factor.
data[as.character(key$vars[order(key$index)])]
will do the trick.
# V1 V2 V3 V4 V5 V6
# 1 0.32192527 0.45545012 1.37095845 0.2059986 -0.3066386 1.3048697
# 2 -0.78383894 0.70483734 -0.56469817 -0.3610573 -1.7813084 2.2866454
# 3 1.57572752 1.03510352 0.36312841 0.7581632 -0.1719174 -1.3888607
# 4 0.64289931 -0.60892638 0.63286260 -0.7267048 1.2146747 -0.2787888
# 5 0.08976065 0.50495512 0.40426832 -1.3682810 1.8951935 -0.1333213
# 6 0.27655075 -1.71700868 -0.10612452 0.4328180 -0.4304691 0.6359504
# 7 0.67928882 -0.78445901 1.51152200 -0.8113932 -0.2572694 -0.2842529
# 8 0.08983289 -0.85090759 -0.09465904 1.4441013 -1.7631631 -2.6564554
# 9 -2.99309008 -2.41420765 2.01842371 -0.4314462 0.4600974 -2.4404669
# 10 0.28488295 0.03612261 -0.06271410 0.6556479 -0.6399949 1.3201133