Make many rows to single using an index - r

According to this input:
structure(list(mid = c("text11", "text12", "text21", "text22",
"text23"), term = c("test", "text", "section", "2", "sending"
)), class = "data.frame", row.names = c(NA, -5L))
How is it possible to transform it using the mid to make the melt row to a single. where in mid the part text1, text2... text12 shows the number of row and the new number the terms exists in this row. Merging them with a with space separation.
Example out dataframe
data.frame(mid = c("text1", "text2"), term = c("test "text", "section 2 sending"
))

This should work
library(dplyr)
library(stringr)
df <- structure(list(mid = c("text11", "text12", "text21", "text22",
"text23"), term = c("test", "text", "section", "2", "sending"
)), class = "data.frame", row.names = c(NA, -5L))
df %>%
mutate(mid = str_extract(mid, "text\\d")) %>%
group_by(mid) %>%
summarise(term = paste(term, collapse=" "))
# # A tibble: 2 x 2
# mid term
# <chr> <chr>
# 1 text1 test text
# 2 text2 section 2 sending
EDIT - to address comment
Addressing the question in the comment, the functions below will work for any case where all of the digits except the last one identify the group (i.e., 1 and 12 in the example below).
df <- structure(list(mid = c("text11", "text12", "text121", "text122", "text123"), term = c("test", "text", "section", "2", "sending")), class = "data.frame", row.names = c(NA, -5L))
df %>%
mutate(mid = str_sub(mid, 1, (nchar(mid)-1))) %>%
group_by(mid) %>%
summarise(term = paste(term, collapse=" "))
# # A tibble: 2 x 2
# mid term
# <chr> <chr>
# 1 text1 test text
# 2 text12 section 2 sending

Related

How to cbind a list of tables by one column, and suffix headings with the list item name

I've got a list of dataframes. I'd like to cbind them by the index column, sample_id. Each table has the same column headings, so I can't just cbind them otherwise I won't know which list item the columns came from. The name of the list item gives the measure used to generate them, so I'd like to suffix the column headings with the list item name.
Here's a simplified demo list of dataframes:
list_of_tables <- list(number = structure(list(sample_id = structure(1:3, levels = c("CSF_1",
"CSF_2", "CSF_4"), class = "factor"), total = c(655, 331, 271
), max = c(12, 5, 7)), row.names = c(NA, -3L), class = c("tbl_df",
"tbl", "data.frame")), concentration_cm_3 = structure(list(sample_id = structure(1:3, levels = c("CSF_1",
"CSF_2", "CSF_4"), class = "factor"), total = c(121454697, 90959097,
43080697), max = c(2050000, 2140000, 915500)), row.names = c(NA,
-3L), class = c("tbl_df", "tbl", "data.frame")), volume_nm_3 = structure(list(
sample_id = structure(1:3, levels = c("CSF_1", "CSF_2", "CSF_4"
), class = "factor"), total = c(2412783009, 1293649395, 438426087
), max = c(103500000, 117400000, 23920000)), row.names = c(NA,
-3L), class = c("tbl_df", "tbl", "data.frame")), area_nm_2 = structure(list(
sample_id = structure(1:3, levels = c("CSF_1", "CSF_2", "CSF_4"
), class = "factor"), total = c(15259297.4, 7655352.2, 3775922
), max = c(266500, 289900, 100400)), row.names = c(NA, -3L
), class = c("tbl_df", "tbl", "data.frame")))
You'll see it's a list of 4 tables, and the list item names are "number", "concentration_cm_3", "volume_nm_3", and "area_nm_2".
Using join_all from plyr I can merge them all by sample_id. However, how do I suffix with the list item name?
merged_tables <- plyr::join_all(stats_by_measure, by = "sample_id", type = "left")
we could do it this way:
The trick is to use .id = 'id' in bind_rows which adds the name as a column. Then we could pivot:
library(dplyr)
library(tidyr)
bind_rows(list_of_tables, .id = 'id') %>%
pivot_wider(names_from = id,
values_from = c(total, max))
sample_id total_number total_concentration_cm_3 total_volume_nm_3 total_area_nm_2 max_number max_concentration_cm_3 max_volume_nm_3 max_area_nm_2
<fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 CSF_1 655 121454697 2412783009 15259297. 12 2050000 103500000 266500
2 CSF_2 331 90959097 1293649395 7655352. 5 2140000 117400000 289900
3 CSF_4 271 43080697 438426087 3775922 7 915500 23920000 100400
Probably, we may use reduce2 here with suffix option from left_join
library(dplyr)
library(purrr)
nm <- names(list_of_tables)[1]
reduce2(list_of_tables, names(list_of_tables)[-1],
function(x, y, z) left_join(x, y, by = 'sample_id', suffix = c(nm, z)))
Or if we want to use join_all, probably we can rename the columns before doing the join
library(stringr)
imap(list_of_tables, ~ {
nm <- .y
.x %>% rename_with(~str_c(.x, nm), -1)
}) %>%
plyr::join_all( by = "sample_id", type = "left")
Or use a for loop
tmp <- list_of_tables[[1]]
names(tmp)[-1] <- paste0(names(tmp)[-1], names(list_of_tables)[1])
for(nm in names(list_of_tables)[-1]) {
tmp2 <- list_of_tables[[nm]]
names(tmp2)[-1] <- paste0(names(tmp2)[-1], nm)
tmp <- left_join(tmp, tmp2, by = "sample_id")
}
tmp

pearson correlation for genes in gene expression data

I have two datasets:
one is actual count and other one is predicted counts. I want to do a pearson correlation between them.
My actual count data look like this:
My predicted counts data look like this:
I want to do pearson correlation for these two datasets for each geneID.
I have written this code:
install.packages("Rcpp")
library(Rcpp)
library("reshape2")
library("ggplot2")
# import in the actual expression values and the gene predicted values
act_cts <- read.delim("GVDS_normalized_counts_2021v1.txt", header = TRUE, sep="\t")
## fix the column names
colnames(act_cts)[1]<-"gene"
colnames(act_cts)<- substr(colnames(act_cts), 1, 7)
pred_cts<-read.delim("GVDS_PrediXcan_Test_2021v1.txt", header=TRUE, sep="\t")
colnames(pred_cts)<-substr(colnames(pred_cts), 1, 15)
## melt the predict counts, so the columns change to row entries FID, IID, gene
melt_pred_cts<-melt(pred_cts, id.vars=c("FID","IID"), variable.name="gene", value.name = "gene_exp")
## melts the actual counts, so it can be easily joined to the final prediction
melt_act_cts<-melt(act_cts, id.vars="gene", variable.name="IID", value.name = "act_gene_exp")
final_cts<-merge(melt_pred_cts,melt_act_cts)
## this takes a minute/ several minutes to run because it is joining on both gene and IID
# runs the Pearson correlation for each gene
all_genes<-unique(final_cts$gene)
pear_cor_all_df<- data.frame(gene=character(), pear_coeff=double())
## runs the correlation
for(g in all_genes)
{
wrk_cts_all<-final_cts[which(final_cts$gene==g),]
# temp working df for each gene
pear_coef_all<-cor(wrk_cts_all$gene_exp, wrk_cts_all$act_gene_exp, method="pearson")
# runs the correlation for each gene between gene_exp and act_gene_exp
new_row_all<-c(g, pear_coef_all)
pear_cor_all_df<-rbind(pear_cor_all_df, new_row_all)
#saves this to the df
}
But its not giving me the correct results.
This is data for act_count:
dput(act_counts[1:10, 1:10])
structure(list(gene = c("ENSG00000152931.6", "ENSG00000183696.9",
"ENSG00000139269.2", "ENSG00000169129.8", "ENSG00000134602.11",
"ENSG00000136237.12", "ENSG00000259425.1", "ENSG00000242284.2",
"ENSG00000235027.1", "ENSG00000228169.3"), Gene_Sy = c("ENSG00000152931.6",
"ENSG00000183696.9", "ENSG00000139269.2", "ENSG00000169129.8",
"ENSG00000134602.11", "ENSG00000136237.12", "ENSG00000259425.1",
"ENSG00000242284.2", "ENSG00000235027.1", "ENSG00000228169.3"
), Chr = c("5", "7", "12", "10", "X", "7", "15", "X", "11", "10"
), Coord = c(59783540, 48128225, 57846106, 116164515, 131157293,
22396763, 23096869, 134953994, 1781578, 116450393), HG00096 = c(0.101857770468582,
8.1838049456063, 1.19991028786682, 0.831939826228749, 27.6464223725999,
3.78850273139249, 0.0540590649819536, 0.351716382898523, 0.200791414339667,
96.1821778045089), HG00097 = c(0.0781095249582053, 5.68691050653862,
1.57357169691446, 0.0697777450667378, 24.3955715036476, 2.05096276937706,
0.112185357489692, 0.444540251941709, 0.190137938062251, 101.17926156721
), HG00099 = c(0.0489806714207954, 2.43465332606958, 0.521615781673147,
0.93108575037257, 16.4453735152148, 4.00031300285966, 0.00359181983091798,
0.227707651999832, 0.0929246302159905, 58.7830634918037), HG00100 = c(0.118597118618172,
3.83089421985197, 1.44722544015787, 0.620940765480242, 24.8066495438254,
3.27161920134705, 0.00049968321150251, 0.714112406249513, 0.108789749488722,
105.483527339859), HG00101 = c(0.00403496367614745, 6.61228835251498,
3.56579072437701, 1.66066836204679, 25.1133488775017, 1.79821591847768,
0.0293976115522442, 0.450911709524112, 0.23244822901371, 105.818192023699
), HG00102 = c(0.0109253485646219, 4.70964559086586, 1.98268073472144,
0.570481056180073, 19.2339882617972, 1.51668840574531, 0.0312661751488703,
0.491437808951175, 0.250905117203001, 136.140843495464)), row.names = c(NA,
-10L), class = c("tbl_df", "tbl", "data.frame"))
This is prd_counts:
dput(prd_counts[1:10, 1:10])
structure(list(FID = c("HG00096", "HG00097", "HG00099", "HG00100",
"HG00101", "HG00102", "HG00103", "HG00105", "HG00106", "HG00107"
), IID = c("HG00096", "HG00097", "HG00099", "HG00100", "HG00101",
"HG00102", "HG00103", "HG00105", "HG00106", "HG00107"), ENSG00000182902.8 = c(0.0223611610092831,
0.0385031316687293, -0.0682504384265577, 0.00018098416274239,
-0.045492721345375, -0.10473163051734, -0.0215970711860838, 0.060455638944161,
-0.00889260689717109, -0.102096211855105), ENSG00000183307.3 = c(0.129041336028238,
-0.13226906002202, 0.005409246530295, -0.0539556427088601, -0.00699884042001628,
-0.204743560777908, -0.0534359750800079, -0.235648260835705,
-0.10230402771496, -0.0914043464852205), ENSG00000237438.1 = c(-0.758838434524167,
-0.579236418964912, -0.695762357174973, -0.368416879945024, -0.339555280234214,
-0.809438763600528, -0.359798980325098, -0.417769387016999, -0.724636782037491,
-0.309671271758401), ENSG00000243156.2 = c(-0.58456094489168,
0.105851861253113, -0.275061563982305, -0.0406543077034047, -0.522672785138957,
-0.126100301787985, -0.288382571274346, -0.354309857822533, -0.314842662063296,
-0.141401921597711), ENSG00000099968.13 = c(0.135357355615122,
0.157616292043257, 0.180059097593111, 0.250009792099489, 0.170653230854707,
0.316157576642492, 0.314671674077333, 0.224102148083679, 0.232969333848649,
0.14963210689311), ENSG00000069998.8 = c(-0.0346986034383362,
-0.0173493017191681, 0, -0.0173493017191681, -0.645266014640116,
-0.0346986034383362, -0.0173493017191681, -0.0173493017191681,
-0.0346986034383362, 0), ENSG00000184979.8 = c(-0.160573318589815,
0.54683218159596, 0.3503062647549, 0.653899917577768, 0.321280544783323,
0.653727041876318, 0.822864620159811, 1.03780221621802, -0.195295753744408,
-0.228590172992798), ENSG00000070413.12 = c(0.775225873145799,
0.602092262450708, 1.0198591935485, 0.65587457098494, 0.306445027670957,
0.581202299884586, 0.836112660742631, 0.559373823767867, 0.46977171007116,
0.84426113999649)), row.names = c(NA, -10L), class = c("tbl_df",
"tbl", "data.frame"))
The provided test samples will not work because there are no genes in common between act_counts and prd_counts. I took the liberty of fixing that by reassigning column names:
library(dplyr)
library(tidyr)
## the line below fixes the problem with test samples
colnames(prd_counts)[3:10] <- act_counts$gene[1:8]
acts <- pivot_longer(act_counts,
cols = starts_with("HG"),
names_to = "FID",
values_to = "Actual")
prds <- pivot_longer(prd_counts,
cols = starts_with("ENSG"),
names_to = "gene",
values_to = "Predicted")
inner_join(acts, prds,
by = c("gene", "FID")) |>
select(gene, FID, Actual, Predicted) |>
group_by(gene) |>
summarize(rho = cor(Actual, Predicted))
##> # A tibble: 8 × 2
##> gene rho
##> <chr> <dbl>
##> 1 ENSG00000134602.11 -0.445
##> 2 ENSG00000136237.12 0.446
##> 3 ENSG00000139269.2 0.543
##> 4 ENSG00000152931.6 0.770
##> 5 ENSG00000169129.8 -0.802
##> 6 ENSG00000183696.9 0.405
##> 7 ENSG00000242284.2 -0.503
##> 8 ENSG00000259425.1 -0.110

How to convert MM::SS string to time format and then manipulate it

I have the following data frame:
library(tibble)
dat <- structure(list(title = c("Title A", "Title B", "Title C"), tm = c(
"31:17",
"30:28", "32:06"
)), row.names = c(NA, -3L), spec = structure(list(
cols = list(title = structure(list(), class = c(
"collector_character",
"collector"
)), tm = structure(list(), class = c(
"collector_character",
"collector"
))), default = structure(list(), class = c(
"collector_guess",
"collector"
)), delim = ","
), class = "col_spec"), class = c(
"spec_tbl_df",
"tbl_df", "tbl", "data.frame"
))
It looks like this:
> dat
# A tibble: 3 × 2
title tm
<chr> <chr>
1 Title A 31:17
2 Title B 30:28
3 Title C 32:06
The second column is time but as character. What I want to do is to
add 5 minutes to it for every row; resulting in:
Title A 36:17
Title B 35:28
Title C 37:06
The strategy is to convert tm into numeric form first. But I failed with this step
dat %>%
mutate(nt = hms::as_hms(as.numeric(tm) + 5))
It gave me:
title tm nt
<chr> <chr> <time>
1 Title A 31:17 NA
2 Title B 30:28 NA
3 Title C 32:06 NA
What's the right way to do it?
Using lubridate -
library(dplyr)
library(lubridate)
dat %>%
mutate(tm = ms(tm) + minutes(5),
tm_string = sprintf('%02d:%02d', tm#minute, tm#.Data))
# title tm tm_string
# <chr> <Period> <chr>
#1 Title A 36M 17S 36:17
#2 Title B 35M 28S 35:28
#3 Title C 37M 6S 37:06

How to create a row by dividing First row by third row

I have a dataset which has values in first row & total in third row. I want to create a fourth row which is percentage of first by total which can be done by dividing first row with fourth row.
below is structure of dataframe
ds = structure(list(t1 = structure(c("1", "2", "Total"), label = "currently smoke any tobacco product", labels = c(no = 0,
yes = 1), class = "haven_labelled"), c1Female = c(679357.516868591,
8394232.81394577, 9073590.33081436), c1Male = c(2254232.8617363,
5802560.20343018, 8056793.06516647), se.c1Female = c(63743.4459540534,
421866.610586848, 485610.056540901), se.c1Male = c(185544.754820322,
386138.725133411, 571683.479953732), Total_1 = c(`1` = 2933590.37860489,
`2` = 14196793.0173759, `3` = 17130383.3959808), per = c(`1` = 0.171250713471665,
`2` = 0.828749286528335, `3` = 1)), class = "data.frame", row.names = c(NA,
-3L))
My try & what is wrong with this
ds %>% mutate(percentage = .[1,]/.[3,])
OUTPUT SHOULD BE : Below is the dput of Output Dataframe that I want
structure(list(t1 = structure(c(1L, 2L, 4L, 3L), .Label = c("1",
"2", "Percentage", "Total"), class = "factor"), c1Female = c(679357.517,
8394232.814, 9073590.331, 0.074871963), c1Male = c(2254232.86,
5802560.2, 8056793.07, 0.279792821), se.c1Female = c(63743.446,
421866.611, 485610.057, 0.131264674), se.c1Male = c(185544.755,
386138.725, 571683.48, 0.324558539), Total_1 = c(2933590.38,
14196793.02, 17130383.4, 0.171250714), per = c(0.171250713, 0.828749287,
1, 0.171250713)), class = "data.frame", row.names = c(NA, -4L
))
Do share the tidyverse way to do this. Also, do tell what is wrong with this approach below line code
ds %>% mutate(percentage = .[1,]/.[3,])
We can use summarise_at to divide multiple column values to return a single row and then bind with the original dataset
library(dplyr)
ds %>%
summarise_at(-1, ~ .[1]/.[3]) %>%
mutate(t1 = 'Percentage') %>%
bind_rows(ds, .)
# t1 c1Female c1Male se.c1Female se.c1Male Total_1 per
#1 1 6.793575e+05 2.254233e+06 6.374345e+04 1.855448e+05 2.933590e+06 0.1712507
#2 2 8.394233e+06 5.802560e+06 4.218666e+05 3.861387e+05 1.419679e+07 0.8287493
#3 Total 9.073590e+06 8.056793e+06 4.856101e+05 5.716835e+05 1.713038e+07 1.0000000
#4 Percentage 7.487196e-02 2.797928e-01 1.312647e-01 3.245585e-01 1.712507e-01 0.1712507
Or another option is add_row
ds %>%
add_row(t1 = 'Percentage') %>%
mutate_at(-1, ~ replace_na(., .[1]/.[3]))
Or do this within the add_row step itself
ds %>%
add_row(t1 = 'Percentage', !!!as.list(.[-1][1,]/.[-1][3,]))
# t1 c1Female c1Male se.c1Female se.c1Male Total_1 per
#1 1 6.793575e+05 2.254233e+06 6.374345e+04 1.855448e+05 2.933590e+06 0.1712507
#2 2 8.394233e+06 5.802560e+06 4.218666e+05 3.861387e+05 1.419679e+07 0.8287493
#3 Total 9.073590e+06 8.056793e+06 4.856101e+05 5.716835e+05 1.713038e+07 1.0000000
#4 Percentage 7.487196e-02 2.797928e-01 1.312647e-01 3.245585e-01 1.712507e-01 0.1712507

text cleaning in R

I have a single column in R that looks like this:
Path Column
ag.1.4->ao.5.5->iv.9.12->ag.4.35
ao.11.234->iv.345.455.1.2->ag.9.531
I want to transform this into:
Path Column
ag->ao->iv->ag
ao->iv->ag
How can I do this?
Thank you
Here is my full dput from my data:
structure(list(Rank = c(10394749L, 36749879L), Count = c(1L,
1L), Percent = c(0.001011122, 0.001011122), Path = c("ao.legacy payment.not_completed->ao.legacy payment.not_completed->ao.legacy payment.completed",
"ao.legacy payment.not_completed->agent.payment.completed")), .Names = c("Rank",
"Count", "Percent", "Path"), class = "data.frame", row.names = c(NA,
-2L))
You could use gsub to match the . and numbers following the . (\\.[0-9]+) and replace it with ''.
df1$Path.Column <- gsub('\\.[0-9]+', '', df1$Path.Column)
df1
# Path.Column
#1 ag -> ao -> iv -> ag
#2 ao -> iv -> ag
Update
For the new dataset df2
gsub('\\.[^->]+(?=(->|\\b))', '', df2$Path, perl=TRUE)
#[1] "ao->ao->ao" "ao->agent"
and for the string showed in the OP's post
str2 <- c('ag.1.4->ao.5.5->iv.9.12->ag.4.35',
'ao.11.234->iv.345.455.1.2->ag.9.531')
gsub('\\.[^->]+(?=(->|\\b))', '', str2, perl=TRUE)
#[1] "ag->ao->iv->ag" "ao->iv->ag"
data
df1 <- structure(list(Path.Column = c("ag.1 -> ao.5 -> iv.9 -> ag.4",
"ao.11 -> iv.345 -> ag.9")), .Names = "Path.Column",
class = "data.frame", row.names = c(NA, -2L))
df2 <- structure(list(Rank = c(10394749L, 36749879L), Count = c(1L,
1L), Percent = c(0.001011122, 0.001011122),
Path = c("ao.legacy payment.not_completed->ao.legacy payment.not_completed->ao.legacy payment.completed",
"ao.legacy payment.not_completed->agent.payment.completed")),
.Names = c("Rank", "Count", "Percent", "Path"), class = "data.frame",
row.names = c(NA, -2L))
It may be easeir to split the strings on '->' and process the substrings separately
# split the stirngs into parts
subStrings <- strsplit(df$Path,'->')
# remove eveything after **first** the dot
subStrings<- lapply(subStrings,
function(x)gsub('\\..*','',x))
# paste them back together.
sapply(subStrings,paste0,collapse="->")
#> "ao->ao->ao" "ao->agent"
or
# split the stirngs into parts
subStrings <- strsplit(df$Path,'->')
# remove the parts of the identifiers after the dot
subStrings<- lapply(subStrings,
function(x)gsub('\\.[^ \t]*','',x))
# paste them back together.
sapply(subStrings,paste0,collapse="->")
#> "ao payment->ao payment->ao payment" "ao payment->agent"

Resources