Regex to catch similar matching word until it hits a number - r

I have this df:
data1 <- structure(list(attr = c("kind1", "kind2", "kind3", "price1",
"price2", "packing1", "weight1", "weight2", "calorie1"), coef = c(-1.08908045977012,
-0.732758620689656, -0.922413793103449, -0.570881226053641, 0.118773946360153,
-0.0287356321839081, -0.168582375478927, 0.173371647509578, -0.646551724137931
), pval = c(0.0000000461586619475345, 0.000225855110699109, 0.00000354973103147522,
0.000189625500287816, 0.506777189443937, 0.801713589134903, 0.269271977099465,
0.33257496253009, 0.0000000192904668116847)), row.names = c(NA,
-9L), class = "data.frame")
# attr coef pval
#1 kind1 -1.08908046 0.00000004615866
#2 kind2 -0.73275862 0.00022585511070
#3 kind3 -0.92241379 0.00000354973103
#4 price1 -0.57088123 0.00018962550029
#5 price2 0.11877395 0.50677718944394
#6 packing1 -0.02873563 0.80171358913490
#7 weight1 -0.16858238 0.26927197709946
#8 weight2 0.17337165 0.33257496253009
#9 calorie1 -0.64655172 0.00000001929047
I'm trying to add by groups according to a regex that identifies similar words up to a certain point, in this case, until a number appears.
For example, in the case of my variables, there would be 5 groups:
kind
Total = kind sum
price
Total = price sum
packing
Total= packing sum
weight
Total = weight sum
calorie
Total = calorie sum
I made this code, but I don't know how to position this regex or how to create it. I tried using stringr but I couldn't do what I want:
data1 %>%
dplyr::arrange(attr) %>%
split(f = .[,"attr"]) %>%
purrr::map_df(., janitor::adorn_totals)
# attr coef pval
# calorie1 -0.64655172 0.00000001929047
# Total -0.64655172 0.00000001929047
# kind1 -1.08908046 0.00000004615866
# Total -1.08908046 0.00000004615866
# kind2 -0.73275862 0.00022585511070
# Total -0.73275862 0.00022585511070
# kind3 -0.92241379 0.00000354973103
# Total -0.92241379 0.00000354973103
# packing1 -0.02873563 0.80171358913490
# Total -0.02873563 0.80171358913490
# price1 -0.57088123 0.00018962550029
# Total -0.57088123 0.00018962550029
# price2 0.11877395 0.50677718944394
# Total 0.11877395 0.50677718944394
# weight1 -0.16858238 0.26927197709946
# Total -0.16858238 0.26927197709946
# weight2 0.17337165 0.33257496253009
# Total 0.17337165 0.33257496253009
It sums individual rows as groups differ by number. I need a regex that captures this:
kind
price
packing
weight
calorie
That is, to capture the letters until a number appears there.

You can create a grouping variable by removing the digits from the attr variable, and then use group_modify:
data1 %>%
group_by(grp = str_remove_all(attr, "[0-9]")) %>%
group_modify(janitor::adorn_totals, where = "row") %>%
ungroup() %>%
select(-grp)
# # A tibble: 14 × 3
# attr coef pval
# <chr> <dbl> <dbl>
# 1 calorie1 -0.647 0.0000000193
# 2 Total -0.647 0.0000000193
# 3 kind1 -1.09 0.0000000462
# 4 kind2 -0.733 0.000226
# 5 kind3 -0.922 0.00000355
# 6 Total -2.74 0.000229
# 7 packing1 -0.0287 0.802
# 8 Total -0.0287 0.802
# 9 price1 -0.571 0.000190
# 10 price2 0.119 0.507
# 11 Total -0.452 0.507
# 12 weight1 -0.169 0.269
# 13 weight2 0.173 0.333
# 14 Total 0.00479 0.602

Something like this:
We could use group_split() after extract the words to identify. Then we get a list. Here we now can iterate with map_df the function adorn_totals:
library(tidyverse)
library(janitor)
data1 %>%
group_split(id=str_extract(attr, '[A-Za-z]+')) %>%
map_dfr(., adorn_totals) %>%
select(-id) %>%
as_tibble()
attr coef pval
<chr> <dbl> <dbl>
1 calorie1 -0.647 0.0000000193
2 Total -0.647 0.0000000193
3 kind1 -1.09 0.0000000462
4 kind2 -0.733 0.000226
5 kind3 -0.922 0.00000355
6 Total -2.74 0.000229
7 packing1 -0.0287 0.802
8 Total -0.0287 0.802
9 price1 -0.571 0.000190
10 price2 0.119 0.507
11 Total -0.452 0.507
12 weight1 -0.169 0.269
13 weight2 0.173 0.333
14 Total 0.00479 0.602

Related

How to consolidate rows based on whether they share the same value in a key column?

I am working with an R script that does not accept duplicates in a certain column because that column gets assigned to the rownames of the data.frame. Many of the rows of the data frame that I am working with, however, contain duplicate genes (gene_id) and normalized expression values because they were sequenced in an exonic region of the same gene along the transcriptome.
> data.table(df) %>%
+ group_by(gene_id)
# A tibble: 138,930 x 544
# Groups: gene_id [22,672]
`#chr` start end gene_id `XXXXX… `XXXXX…
<fct> <int> <int> <fct> <dbl> <dbl>
1 chr20 290428 290429 ENSG0000019647… 0.830 2.54
2 chr20 290748 290749 ENSG0000019647… 0.830 2.54
3 chr20 290777 290778 ENSG0000019647… 0.830 2.54
4 chr20 296967 296968 ENSG0000024731… -0.0533 0.308
5 chr20 325233 325234 ENSG0000022537… -0.299 -0.274
6 chr20 325594 325595 ENSG0000017773… 0.246 1.98
7 chr20 346781 346782 ENSG0000012584… -0.156 -1.06
8 chr20 346882 346883 ENSG0000012584… -0.156 -1.06
9 chr20 347023 347024 ENSG0000012584… -0.156 -1.06
10 chr20 347104 347105 ENSG0000012584… -0.156 -1.06
I'd like to consolidate, for example, rows 1-3 and keep the lowest start value and the highest end value, and only get 1 resulting row. However, I don't know how to do this in dplyr other than to start by group_by the common column. What do you suggest/how would I go from here?
We can use slice_head with n = 1 after updating the 'start', 'end' columns by the min and max (or first and last - if ordered) on the grouped data
library(dplyr)
df %>%
group_by(gene_id) %>%
mutate(start = min(start), end = max(end)) %>%
slice_head(n = 1) %>%
ungroup
Or use distinct
df %>%
group_by(gene_id) %>%
mutate(start = min(start), end = max(end)) %>%
ungroup %>%
distinct(gene_id, .keep_all = TRUE)

row bind list columns using dplyr

I would like to find a better way to bind together the results of any number of regressions after adding an identifier for each model. The code below is my current solution but is too manual for a large number of regressions. This is part of a larger tidy workflow so a solution inside of the tidyverse is preferred but whatever works is fine. Thanks
library(tidyverse)
library(broom)
model_dat=mtcars %>%
do(lm_1 = tidy(lm(disp~ wt*vs, data = .),conf.int=T),
lm_2=tidy(lm(cyl ~ wt*vs, data = .),conf.int=T ),
lm_3=tidy(lm(mpg ~ wt*vs, data = .),conf.int=T ))
df=model_dat %>%
select(lm_1) %>%
unnest(c(lm_1)) %>%
mutate(model="one") %>%
select(model,term,estimate,p.value:conf.high) %>%
bind_rows(
model_dat %>%
select(lm_2) %>%
unnest(c(lm_2)) %>%
mutate(model="two") %>%
select(model,term,estimate,p.value:conf.high)) %>%
bind_rows(
model_dat %>%
select(lm_3) %>%
unnest(c(lm_3)) %>%
mutate(model="three") %>%
select(model,term,estimate,p.value:conf.high))
It may be easier with map2 i.e. loop across the columns and the corresponding english word for the sequence of columns, pluck the list element, create the 'model' column with second argument i.e. engish words (.y), select the columns of interest, and create a single dataset by specifying _dfr in map
library(purrr)
library(english)
library(dplyr)
library(broom)
map2_dfr(model_dat, as.character(english(seq_along(model_dat))),
~ .x %>%
pluck(1) %>%
mutate(model = .y) %>%
select(model, term, estimate, p.value:conf.high) )
-output
# A tibble: 12 x 6
# model term estimate p.value conf.low conf.high
# <chr> <chr> <dbl> <dbl> <dbl> <dbl>
# 1 one (Intercept) -70.0 1.55e- 1 -168. 28.2
# 2 one wt 102. 8.20e- 9 76.4 128.
# 3 one vs 31.2 6.54e- 1 -110. 172.
# 4 one wt:vs -36.7 1.10e- 1 -82.2 8.82
# 5 two (Intercept) 4.31 1.28e- 5 2.64 5.99
# 6 two wt 0.849 4.90e- 4 0.408 1.29
# 7 two vs -2.19 7.28e- 2 -4.59 0.216
# 8 two wt:vs 0.0869 8.20e- 1 -0.689 0.862
# 9 three (Intercept) 29.5 6.55e-12 24.2 34.9
#10 three wt -3.50 2.33e- 5 -4.92 -2.08
#11 three vs 11.8 4.10e- 3 4.06 19.5
#12 three wt:vs -2.91 2.36e- 2 -5.40 -0.419
Or use summarise with across, unclass and then bind with bind_rows
model_dat %>%
summarise(across(everything(), ~ {
# // get the column name
nm1 <- cur_column()
# // extract the list element (.[[1]])
list(.[[1]] %>%
# // create new column by extracting the numeric part
mutate(model = english(readr::parse_number(nm1))) %>%
# // select the subset of columns, wrap in a list
select(model, term, estimate, p.value:conf.high))
}
)) %>%
# // unclass to list
unclass %>%
# // bind the list elements
bind_rows
-output
# A tibble: 12 x 6
# model term estimate p.value conf.low conf.high
# <english> <chr> <dbl> <dbl> <dbl> <dbl>
# 1 one (Intercept) -70.0 1.55e- 1 -168. 28.2
# 2 one wt 102. 8.20e- 9 76.4 128.
# 3 one vs 31.2 6.54e- 1 -110. 172.
# 4 one wt:vs -36.7 1.10e- 1 -82.2 8.82
# 5 two (Intercept) 4.31 1.28e- 5 2.64 5.99
# 6 two wt 0.849 4.90e- 4 0.408 1.29
# 7 two vs -2.19 7.28e- 2 -4.59 0.216
# 8 two wt:vs 0.0869 8.20e- 1 -0.689 0.862
# 9 three (Intercept) 29.5 6.55e-12 24.2 34.9
#10 three wt -3.50 2.33e- 5 -4.92 -2.08
#11 three vs 11.8 4.10e- 3 4.06 19.5
#12 three wt:vs -2.91 2.36e- 2 -5.40 -0.419

tidyr::spread() with multiple keys and values

I assume this has been asked multiple times but I couldn't find the proper words to find a workable solution.
How can I spread() a data frame based on multiple keys for multiple values?
A simplified (I have many more columns to spread, but on only two keys: Id and time point of a given measurement) data I'm working with looks like this:
df <- data.frame(id = rep(seq(1:10),3),
time = rep(1:3, each=10),
x = rnorm(n=30),
y = rnorm(n=30))
> head(df)
id time x y
1 1 1 -2.62671241 0.01669755
2 2 1 -1.69862885 0.24992634
3 3 1 1.01820778 -1.04754037
4 4 1 0.97561596 0.35216040
5 5 1 0.60367158 -0.78066767
6 6 1 -0.03761868 1.08173157
> tail(df)
id time x y
25 5 3 0.03621258 -1.1134368
26 6 3 -0.25900538 1.6009824
27 7 3 0.13996626 0.1359013
28 8 3 -0.60364935 1.5750232
29 9 3 0.89618748 0.0294315
30 10 3 0.14709567 0.5461084
What i'd like to have is a dataframe populated like this:
One row per Id columns for each value from the time and each measurement variable.
With the devel version of tidyr (tidyr_0.8.3.9000), we can use pivot_wider to reshape multiple value columns from long to wide format
library(dplyr)
library(tidyr)
library(stringr)
df %>%
mutate(time = str_c("time", time)) %>%
pivot_wider(names_from = time, values_from = c("x", "y"), names_sep="")
# A tibble: 10 x 7
# id xtime1 xtime2 xtime3 ytime1 ytime2 ytime3
# <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 1 -0.256 0.483 -0.254 -0.652 0.655 0.291
# 2 2 1.10 -0.596 -1.85 1.09 -0.401 -1.24
# 3 3 0.756 -2.19 -0.0779 -0.763 -0.335 -0.456
# 4 4 -0.238 -0.675 0.969 -0.829 1.37 -0.830
# 5 5 0.987 -2.12 0.185 0.834 2.14 0.340
# 6 6 0.741 -1.27 -1.38 -0.968 0.506 1.07
# 7 7 0.0893 -0.374 -1.44 -0.0288 0.786 1.22
# 8 8 -0.955 -0.688 0.362 0.233 -0.902 0.736
# 9 9 -0.195 -0.872 -1.76 -0.301 0.533 -0.481
#10 10 0.926 -0.102 -0.325 -0.678 -0.646 0.563
NOTE: The numbers are different as there was no set seed while creating the sample dataset
Reshaping with multiple value variables can best be done with dcast from data.table or reshape from base R.
library(data.table)
out <- dcast(setDT(df), id ~ paste0("time", time), value.var = c("x", "y"), sep = "")
out
# id xtime1 xtime2 xtime3 ytime1 ytime2 ytime3
# 1: 1 0.4334921 -0.5205570 -1.44364515 0.49288757 -1.26955148 -0.83344256
# 2: 2 0.4785870 0.9261711 0.68173681 1.24639813 0.91805332 0.34346260
# 3: 3 -1.2067665 1.7309593 0.04923993 1.28184341 -0.69435556 0.01609261
# 4: 4 0.5240518 0.7481787 0.07966677 -1.36408357 1.72636849 -0.45827205
# 5: 5 0.3733316 -0.3689391 -0.11879819 -0.03276689 0.91824437 2.18084692
# 6: 6 0.2363018 -0.2358572 0.73389984 -1.10946940 -1.05379502 -0.82691626
# 7: 7 -1.4979165 0.9026397 0.84666801 1.02138768 -0.01072588 0.08925716
# 8: 8 0.3428946 -0.2235349 -1.21684977 0.40549497 0.68937085 -0.15793111
# 9: 9 -1.1304688 -0.3901419 -0.10722222 -0.54206830 0.34134397 0.48504564
#10: 10 -0.5275251 -1.1328937 -0.68059800 1.38790593 0.93199593 -1.77498807
Using reshape we could do
# setDF(df) # in case df is a data.table now
reshape(df, idvar = "id", timevar = "time", direction = "wide")
Your entry data frame is not tidy. You should use gather to make it so.
gather(df, key, value, -id, -time) %>%
mutate(key = paste0(key, "time", time)) %>%
select(-time) %>%
spread(key, value)

R find top n results of column operation on aggregate operation per column over dataframe

Say I have a dataframe called RaM that holds cumulative return values. In this case, they literally are just a single row of cumulative return values along with column headers, but I would like to apply the logic to not just single row dataframes.
Say I want to sort by the max cumulative return value of each column, or even the average, or the sum of each column.
So each column would be re-ordered so that the max cumulative returns for each column is compared and the highest return becomes the 1st column with the min being the last column
then say I want to derive either the top 10 (1st 10 columns after they are rearranged), or even the top 10%.
I know how to derive the column averages, but I don't know how to effectively do the remaining operations. There is an order function, but when I used it, it stripped my column names, which I need. I could easily then cut the 1st say 10 columns, but is there a way that preserves the names? I don't think I can easily extract the names from the unordered original dataframe and apply it with my sorted by aggregate dataframe. My goal is to extract the column names of the top n columns (in dataframe RaM) in terms of a column aggregate function over the entire dataframe.
something like
top10 <- getTop10ColumnNames(colSums(RaM))
that would then output a dataframe of the top 10 columns in terms of their sum from RaM
Here's output off RaM
> head(RaM,2)
ABMD ACAD ALGN ALNY ANIP ASCMA AVGO CALD CLVS CORT
2013-01-31 0.03794643 0.296774194 0.13009009 0.32219178 0.13008130 0.02857604 0.13014640 -0.07929515 0.23375000 0.5174825
2013-02-28 0.14982079 0.006633499 0.00255102 -0.01823456 -0.05755396 0.07659708 -0.04333138 0.04066986 -0.04457953 -0.2465438
CPST EA EGY EXEL FCSC FOLD GNC GTT HEAR HK HZNP
2013-01-31 -0.05269663 0.08333333 -0.01849711 0.01969365 0 0.4179104 0.07992677 0.250000000 0.2017417 0.10404624 -0.085836910
2013-02-28 0.15051595 0.11443102 -0.04475854 -0.02145923 0 -0.2947368 0.14079036 0.002857143 0.4239130 -0.07068063 -0.009389671
ICON IMI IMMU INFI INSY KEG LGND LQDT MCF MU
2013-01-31 0.07750896 0.05393258 -0.01027397 -0.01571429 -0.05806459 0.16978417 -0.03085824 -0.22001958 0.01345609 0.1924290
2013-02-28 -0.01746362 0.03091684 -0.20415225 0.19854862 0.36849503 0.05535055 0.02189055 0.06840289 -0.09713487 0.1078042
NBIX NFLX NVDA OREX PFPT PQ PRTA PTX RAS REXX RTRX
2013-01-31 0.2112299 0.7846467 0.00000000 0.08950306 0.06823721 0.03838384 -0.1800819 0.04387097 0.23852335 0.008448541 0.34328358
2013-02-28 0.1677704 0.1382251 0.03888981 0.04020979 0.06311787 -0.25291829 0.0266223 -0.26328801 0.05079882 0.026656512 -0.02222222
SDRL SHOS SSI STMP TAL TREE TSLA TTWO UVE VICL
2013-01-31 0.07826093 0.2023956 -0.07788381 0.07103175 -0.14166875 -0.030504714 0.10746974 0.1053588 0.0365299 0.2302405
2013-02-28 -0.07585546 0.1384419 0.08052150 -0.09633197 0.08009728 -0.002860412 -0.07144761 0.2029581 -0.0330408 -0.1061453
VSI VVUS WLB
2013-01-31 0.06485356 -0.0976155 0.07494647
2013-02-28 -0.13965291 -0.1156069 0.04581673
Here's one way using the first section of your sample data to illustrate. You can gather up all the columns so that we can do summary calculations more easily, calculate all the summaries by group that you want, and then sort with arrange. Here I ordered with the highest sums first, but you could do whatever order you wanted.
library(tidyverse)
ram <- read_table2(
"ABMD ACAD ALGN ALNY ANIP ASCMA AVGO CALD CLVS CORT
0.03794643 0.296774194 0.13009009 0.32219178 0.13008130 0.02857604 0.13014640 -0.07929515 0.23375000 0.5174825
0.14982079 0.006633499 0.00255102 -0.01823456 -0.05755396 0.07659708 -0.04333138 0.04066986 -0.04457953 -0.2465438"
)
summary <- ram %>%
gather(colname, value) %>%
group_by(colname) %>%
summarise_at(.vars = vars(value), .funs = funs(mean = mean, sum = sum, max = max)) %>%
arrange(desc(sum))
summary
#> # A tibble: 10 x 4
#> colname mean sum max
#> <chr> <dbl> <dbl> <dbl>
#> 1 ALNY 0.152 0.304 0.322
#> 2 ACAD 0.152 0.303 0.297
#> 3 CORT 0.135 0.271 0.517
#> 4 CLVS 0.0946 0.189 0.234
#> 5 ABMD 0.0939 0.188 0.150
#> 6 ALGN 0.0663 0.133 0.130
#> 7 ASCMA 0.0526 0.105 0.0766
#> 8 AVGO 0.0434 0.0868 0.130
#> 9 ANIP 0.0363 0.0725 0.130
#> 10 CALD -0.0193 -0.0386 0.0407
If you then want to reorder your original data frame, you can get the order from this summary output and index with it:
ram[summary$colname]
#> # A tibble: 2 x 10
#> ALNY ACAD CORT CLVS ABMD ALGN ASCMA AVGO ANIP
#> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 0.322 0.297 0.517 0.234 0.0379 0.130 0.0286 0.130 0.130
#> 2 -0.0182 0.00663 -0.247 -0.0446 0.150 0.00255 0.0766 -0.0433 -0.0576
#> # ... with 1 more variable: CALD <dbl>
Created on 2018-08-01 by the reprex package (v0.2.0).

R: Which is the tidy way to apply a function over various columns of each row of a data frame?

I would like to apply a function to all rows of a data frame where each application the columns as distinct inputs (not like mean, rather as parameters).
I wonder what the tidy way is to do the following:
# Data
successes <- c(0,3,6,15,15,17,12,9,22,33)
trials <- c(50,1788,1876,3345,1223,856,342,214,265,257)
prognosis <- 0.01*c(0.05,0.10,0.25,0.5,0.75,1.3,2,3.4,6,10)
test_data = data.frame(successes = successes, trials = trials,
prognosis = prognosis, p_value1 = NA, p_value2 = NA)
for(i in 1: nrow(test_data)){
test_data$p_value1[i] = binom.test(test_data$successes[i], test_data$trials[i],
test_data$prognosis[i], "less")$p.value
test_data$p_value2[i] = binom.test(test_data$successes[i], test_data$trials[i],
test_data$prognosis[i], "greater")$p.value
}
One possible way is this:
successes <- c(0,3,6,15,15,17,12,9,22,33)
trials <- c(50,1788,1876,3345,1223,856,342,214,265,257)
prognosis <- 0.01*c(0.05,0.10,0.25,0.5,0.75,1.3,2,3.4,6,10)
test_data = data.frame(successes = successes, trials = trials,
prognosis = prognosis, p_value1 = NA, p_value2 = NA)
library(dplyr)
test_data %>%
rowwise() %>%
mutate(p_value1 = binom.test(successes, trials, prognosis, "less")$p.value,
p_value2 = binom.test(successes, trials, prognosis, "greater")$p.value) %>%
ungroup()
# # A tibble: 10 x 5
# successes trials prognosis p_value1 p_value2
# <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 0. 50. 0.000500 0.975 1.00
# 2 3. 1788. 0.00100 0.893 0.266
# 3 6. 1876. 0.00250 0.806 0.330
# 4 15. 3345. 0.00500 0.396 0.697
# 5 15. 1223. 0.00750 0.975 0.0467
# 6 17. 856. 0.0130 0.966 0.0595
# 7 12. 342. 0.0200 0.978 0.0447
# 8 9. 214. 0.0340 0.805 0.306
# 9 22. 265. 0.0600 0.950 0.0786
# 10 33. 257. 0.100 0.943 0.0822
Or using a vectorized function without rowwise:
# create function and vectorise it
GetPvalue = function(s, t, p, alt) {binom.test(s, t, p, alt)$p.value}
GetPvalue = Vectorize(GetPvalue)
test_data %>%
mutate(p_value1 = GetPvalue(successes, trials, prognosis, "less"),
p_value2 = GetPvalue(successes, trials, prognosis, "greater"))
# successes trials prognosis p_value1 p_value2
# 1 0 50 0.0005 0.9753038 1.00000000
# 2 3 1788 0.0010 0.8933086 0.26613930
# 3 6 1876 0.0025 0.8061877 0.32975624
# 4 15 3345 0.0050 0.3963610 0.69722243
# 5 15 1223 0.0075 0.9748903 0.04667939
# 6 17 856 0.0130 0.9656352 0.05952219
# 7 12 342 0.0200 0.9781863 0.04473155
# 8 9 214 0.0340 0.8047247 0.30581962
# 9 22 265 0.0600 0.9503332 0.07855963
# 10 33 257 0.1000 0.9433326 0.08219425

Resources