R strip split a column in dataframe - r

I have a 'data' frame, with multiple columns, one of them being 'Runtime' which has data in two formats:
Runtime
1 h 10 min
67 min
1 h 0 min
86 min
97 min
I want to convert all of them into Minutes. Have tried 'strsplit' and 'str_split_fixed'. Can anyone show me a way to achieve my goal, split or any other method?
Thank you in advance !

I think I saw this kind of solution somewhere. Don't hit me.
df <- data.frame(Runtime = c('1 h 10 min', '67 min', '1 h 0 min', '86 min', '97 min'))
# Build an arithmetic expression per runtime: "1 h 10 min" -> "1 * 60 + 10 * 1".
df$exp <- gsub("h", "* 60 +", df$Runtime)
df$exp <- gsub("min", "* 1", df$exp)
# Evaluate each generated expression; safe only because the text comes from the
# two controlled substitutions above, never from untrusted input. vapply pins
# the result to exactly one double per element.
vapply(df$exp, function(expr_text) eval(parse(text = expr_text)), numeric(1))
1 * 60 + 10 * 1 67 * 1 1 * 60 + 0 * 1 86 * 1 97 * 1
70 67 60 86 97

You can get it one call using gsubfn and regex:
library(gsubfn)
# NOTE(review): the formula body references x, y AND z, but the pattern defines
# only two capture groups (optional hours, minutes); gsubfn is supposed to bind
# formula variables to capture groups in order, so z looks unbound here --
# verify this call actually runs as posted. Also note the trailing `x` argument
# is presumably the input character vector of runtimes, and the result (see
# output below) is character, not numeric.
gsubfn("^(?:(\\d+)\\s*h)?\\s*(\\d+)\\s*min.*$",
~ sum(as.numeric(x) * 60, as.numeric(y), as.numeric(z), na.rm=TRUE), x)
#[1] "70" "67" "60" "86" "97"

Here's an example of how you can do it:
# setting up your data.frame of interest
df <- data.frame(Runtime = c('1 h 10 min', '67 min', '1 h 0 min', '86 min', '97 min'))
df$Runtime <- gsub(' min', '', df$Runtime) # remove the min labels
hrs <- grepl('h', x = df$Runtime) # which values are in an "x h y" format?
# convert each "x h y" entry into minutes: 60 * hours + minutes.
# vapply (not sapply) guarantees one double per entry.
runtime_sub <- vapply(strsplit(df[hrs, 'Runtime'], ' h '),
                      function(i) sum(as.numeric(i) * c(60, 1)),
                      numeric(1))
# convert the vector to numeric; the "x h y" entries become NA here, which is
# expected (warning silenced deliberately) -- they are filled in next
df$Runtime <- suppressWarnings(as.numeric(df$Runtime))
df[hrs, 'Runtime'] <- runtime_sub # add the converted values
This results in:
Runtime
1 70
2 67
3 60
4 86
5 97

1) Read df[[1]] and if the third column is NA then the first column gives the minutes; otherwise, 60 times the first column plus the third column gives the minutes:
# read.table splits each "1 h 10 min" entry into columns V1..V4; fill = TRUE
# pads the "67 min" rows (only two fields) with NA, so V3 is NA exactly when
# the entry has no hour part -- then V1 is minutes, else 60 * hours + minutes
with(read.table(text = as.character(df[[1]]), fill = TRUE),
ifelse(is.na(V3), V1, 60*V1 + V3))
## [1] 70 67 60 86 97
2) A variation is to paste "0 h" at the beginning of each component that does not have an h giving hm and read that computing 60 times the first column plus the third column.
# prepend "0 h" to entries that lack an hour part so every row parses
# uniformly as "<h> h <m> min"; minutes are then always 60 * V1 + V3
hm <- paste(ifelse(grepl("h", df[[1]]), "", "0 h"), df[[1]])
with(read.table(text = hm), 60 * V1 + V3)
## [1] 70 67 60 86 97

Related

How to randomly select row from a dataframe for which the row skewness is larger that a given value in R

I am trying to select random rows from a data frame with 1000 lines (and six columns) where the skewness of the line is larger than a given value (say Sk > 0.3).
I've generated the following data frame
df=data.frame(replicate(6,sample(10:100,1000,rep=TRUE)))
I can get row skewness from the fbasics package:
rowSkewness(df) gives:
[8] -0.2243295435 0.5306809351 0.0707122386 0.0341447417 0.3339384838 -0.3910593364 -0.6443905090
[15] 0.5603809206 0.4406091534 -0.3736108832 0.0397860038 0.9970040772 -0.7702547535 0.2065830354
But now, I need to select say 10 rows of the df which have rowskewness greater than say 0.1... May with
for (a in 1:10) {
sample.data[a,] = sample(x=df[which(rowSkewness(df[sample(1:nrow(df),1)>0.1),], size = 1, replace = TRUE)
}
or something like this?
Any thoughts on this will be appreciated.
thanks in advance.
you can use the sample_n() function or sample_frac() - makes your version a little shorter:
# dplyr supplies %>%, filter() and sample_n(); the original loaded tidyr,
# which only worked because tidyr happens to re-export the magrittr pipe
library(dplyr)
library(fBasics) # rowSkewness()
df <- data.frame(replicate(6, sample(10:100, 1000, rep = TRUE)))
# keep the rows whose row-wise skewness exceeds 0.1, then draw 10 at random
x <- df %>% dplyr::filter(rowSkewness(df) > 0.1) %>% dplyr::sample_n(10)
Got it:
x=df %>% filter(rowSkewness(df)>0.1)
# NOTE(review): `samplesize` and `sample.data` are not defined anywhere in this
# snippet, and sample(x = x, ...) on a data frame samples COLUMNS, not rows --
# this loop cannot run as written; verify the intended row-sampling logic
for (a in 1:samplesize) {
sample.data[a,] = sample(x=x, size = 1, replace = TRUE)
}
Just do a subset:
# logical row subset: keep the rows of DF whose row-wise skewness is > 0.1
res1 <- DF[fBasics::rowSkewness(DF) > .1, ]
head(res1)
# X1 X2 X3 X4 X5 X6
# 7 56 28 21 93 74 24
# 8 33 56 23 44 10 12
# 12 29 19 29 38 94 95
# 13 35 51 54 98 66 10
# 14 12 51 24 23 36 68
# 15 50 37 81 22 55 97
Or with e1071::skewness:
# same subset via e1071: apply() works row-wise over the matrix form of DF
# (as.matrix is safe here -- every column is numeric); the stopifnot asserts
# both skewness implementations select the same rows
res2 <- DF[apply(as.matrix(DF), 1, e1071::skewness) > .1, ]
stopifnot(all.equal(res1, res2))
Data
set.seed(42); DF <- data.frame(replicate(6, sample(10:100, 1000, rep=TRUE)))

Finding the overlap between two data frames in R, how can I make my code more efficient?

I have two dataframes in R. In the first one I have two columns one is called "chr" and the other "position"; in the second dataframe I have three columns one is again "chr", other "start" and another one "end". I want to select those rows in the first dataframe in which chr value is the same as the second data frame, but also whose "position" is in the interval start-end of the second data frame.
For that I have written a function in R that gives me the desired output but it is very slow when I run it with huge data frames.
# My DataFrames are:
# bed: one interval per row -- x1 = interval start, x3 = interval end
bed <- data.frame(Chr = c(rep("chr1",4),rep("chr2",3),rep("chr3",1)),
x1 = c(5,20,44,67,5,20,44,20),
x3=c(12,43,64,94,12,43,64,63))
# snv: one point per row -- the question asks which positions fall inside
# some bed interval on the same Chr
snv <- data.frame(Chr = c(rep("chr1",6),rep("chr3",6)),
position = c(5,18,46,60,80,90,21,60,75,80,84,87))
# My function is:
# get_overlap: TRUE if `position` lies inside any [start, end] interval of
# `df` (columns: chr, start, end) for chromosome `chrom`, FALSE otherwise.
#
# Vectorized rewrite: the original looped over every row and materialized
# seq(start, end) just to test membership -- O(rows * interval width) -- and
# it misbehaves on an empty df because 1:nrow(df) yields c(1, 0). Three
# vectorized comparisons do the same containment test in a single pass,
# return FALSE for an empty df, and also handle non-integer positions.
get_overlap <- function(df, position, chrom){
  any(df[[1]] == chrom & df[[2]] <= position & df[[3]] >= position)
}
# The code is:
# one logical per snv row; vapply preallocates the result (the original grew
# overlap_vector with c() inside the loop, copying the vector every iteration)
overlap_vector <- vapply(
  seq_len(nrow(snv)),
  function(row) get_overlap(bed, snv[row, 2], snv[row, 1]),
  logical(1)
)
print(snv[overlap_vector,])
How can I make this more efficient? I have never worked with hash tables, can that be the answer?
I'm sure there's a more elegant data.table solution, but this works. First I load the package.
# Load package
library(data.table)
Then, I define the data tables
# Define data tables
# (same values as the question's bed/snv, but x1/x3 are renamed start/end so
# the non-equi join conditions below read naturally)
bed <- data.table(Chr = c(rep("chr1",4),rep("chr2",3),rep("chr3",1)),
start = c(5,20,44,67,5,20,44,20),
end = c(12,43,64,94,12,43,64,63))
snv <- data.table(Chr = c(rep("chr1",6),rep("chr3",6)),
position = c(5,18,46,60,80,90,21,60,75,80,84,87))
Here, I do a non-equi join on position and start/end, and an equal join on Chr. I assume you want to keep all columns, so specified them in the j argument and omitted those rows without matches.
# non-equi join: for each snv row, find the bed rows on the same Chr whose
# [start, end] contains position; j renames the join-prefixed columns
# (x.* = bed, i.* = snv) back to plain names, and na.omit() drops snv
# positions that matched no interval
na.omit(bed[snv,
.(Chr, start = x.start, end = x.end, position = i.position),
on = c("start <= position", "end >= position", "Chr == Chr")])
#> Chr start end position
#> 1: chr1 5 12 5
#> 2: chr1 44 64 46
#> 3: chr1 44 64 60
#> 4: chr1 67 94 80
#> 5: chr1 67 94 90
#> 6: chr3 20 63 21
#> 7: chr3 20 63 60
Created on 2019-08-21 by the reprex package (v0.3.0)
Edit
A quick benchmarking shows that Nathan's solution is about twice as fast!
Unit: milliseconds
expr min lq mean median uq max neval
NathanWren() 1.684392 1.729557 1.819263 1.751520 1.787829 5.138546 100
Lyngbakr() 3.336902 3.395528 3.603376 3.441933 3.496131 7.720925 100
The data.table package is great for fast merging of tables. It also comes with a vectorized between function for just this type of task.
library(data.table)
# Convert the data.frames to data.tables
setDT(bed)
setDT(snv)
# Use the join syntax for data.table, then filter for the desired rows
# step 1: the equi-join on Chr pairs every snv position with every bed
# interval on that chromosome (hence allow.cartesian); step 2: keep only the
# pairs whose position lies within [x1, x3] -- positions with no Chr match
# get NA bounds, for which between() returns NA, so they are dropped too
overlap_dt <- bed[
snv,
on = "Chr",
allow.cartesian = TRUE # many-to-many matching
][
between(position, lower = x1, upper = x3)
]
overlap_dt
# Chr x1 x3 position
# 1: chr1 5 12 5
# 2: chr1 44 64 46
# 3: chr1 44 64 60
# 4: chr1 67 94 80
# 5: chr1 67 94 90
# 6: chr3 20 63 21
# 7: chr3 20 63 60

If() statement in R

I am not very experienced in if statements and loops in R.
Probably you can help me to solve my problem.
My task is to add +1 to df$fz if sum(df$fz) < 450, but at the same time I have to add +1 only to the max values in df$fz, stopping at the moment when sum(df$fz) reaches 450
Here is my df
ID_PP <- c(3,6, 22, 30, 1234456)
z <- c(12325, 21698, 21725, 8378, 18979)
# fz sums to 447, i.e. 3 short of the 450 target the question wants to reach
fz <- c(134, 67, 70, 88, 88)
df <- data.frame(ID_PP,z,fz)
After mutating the new column df$new_value, it should look like 134 68 71 88 89
At this moment I have this code, but it adds +1 to all values.
# NOTE(review): mutate() returns a modified copy -- the result is never
# assigned, so df is unchanged; and fz + 1 bumps EVERY row, not only the max
# values, which is why this attempt does not do what the question asks
if (sum(df$fz ) < 450) {
mutate(df, new_value=fz+1)
}
I know that I can pick top_n(3, z) and add +1 only to this top, but it is not what I want, because in that case I have to pick a top manually after checking sum(df$fz)
From what I understood from #Oksana's question and comments, we probably can do it this way:
library(tidyverse)
# data
vru <- data.frame(
id = c(3, 6, 22, 30, 1234456),
z = c(12325, 21698, 21725, 8378, 18979),
fz = c(134, 67, 70, 88, 88)
)
# solution
vru %>% # deficit 450 - sum(fz) = number of rows that still need a +1
top_n(450 - sum(fz), z) %>% # subset by top z, if sum(fz) == 450 -> NULL
mutate(fz = fz + 1) %>% # increase fz by 1 for the subset
bind_rows( #
anti_join(vru, ., by = "id"), # take rows from vru which are not in subset
. # take subset with transformed fz
) %>% # bind those subsets
arrange(id) # sort rows by id
# output
id z fz
1 3 12325 134
2 6 21698 68
3 22 21725 71
4 30 8378 88
5 1234456 18979 89
The clarifications in the comments helped. Let me know if this works for you. Of course, you can drop the cumsum_fz and leftover columns.
# Making variables to use in the calculation
# (arrange(fz) matters: new_value below is built from sort(df$fz), so the
# data frame must be in ascending fz order for the columns to line up)
df <- df %>%
arrange(fz) %>%
mutate(cumsum_fz = cumsum(fz),
leftover = 450 - cumsum_fz)
# Find the minimum, non-negative value to use for select values that need +1
# (min_pos equals 450 - sum(fz), i.e. how many values still need a +1)
min_pos <- min(df$leftover[df$leftover > 0])
# Creating a vector that adds 1 using the min_pos value and keeps
# the other values the same
# NOTE(review): the +1 goes to the min_pos SMALLEST fz values (ties broken by
# sort order), while the question text says "max values" -- verify which end
# the asker actually wants bumped
df$new_value <- c((head(sort(df$fz), min_pos) + 1), tail(sort(df$fz), length(df$fz) - min_pos))
# Checking the sum of the new value
> sum(df$new_value)
[1] 450
>
> df
ID_PP z fz cumsum_fz leftover new_value
1 6 21698 67 67 383 68
2 22 21725 70 137 313 71
3 30 8378 88 225 225 89
4 1234456 18979 88 313 137 88
5 3 12325 134 447 3 134
EDIT:
Because utubun already posted a great tidyverse solution, I am going to translate my first one completely to base (it was a bit sloppy to mix the two anyway). Same logic as above, and using the data OP provided.
> # Using base
> df <- df[order(df$fz),]
>
> leftover <- 450 - cumsum(fz)
> min_pos <- min(leftover[leftover > 0])
> df$new_value <- c((head(sort(df$fz), min_pos) + 1), tail(sort(df$fz), length(df$fz) - min_pos))
>
> sum(df$new_value)
[1] 450
> df
ID_PP z fz new_value
2 6 21698 67 68
3 22 21725 70 71
4 30 8378 88 89
5 1234456 18979 88 88
1 3 12325 134 134

Sum and place it elsewhere in R

I have one column with 950 numbers. I want to sum row 1:40 and place it in a new column on row 50, then sum row 2:41 and place it on row 51 in the new column and so on. How do I do?
You can use the function RcppRoll::roll_sum()
Hope this helps:
# sum of rows 1:40 must land on row 50, so pad with 49 NAs, not 50
# (the original r <- 50 placed sum(1:40) on row 51 -- an off-by-one);
# the question also says the column has 950 numbers, hence 1:950, not 1:951
r <- 49
df1 <- data.frame(c1 = 1:950)
v1 <- RcppRoll::roll_sum(df1$c1, n=40) # v1[k] = sum of rows k:(k+39)
df1$c2 <- c(rep(NA, r), v1[1:(nrow(df1)-r)]) # row r+k holds v1[k]
View(df1) # in RStudio
You decide what happens with the sum from row 911 onwards (I've ignored them)
You can use RcppRoll::roll_sum() and dplyr::lag()...
df <- data.frame(v = 1:950)
library(dplyr)
library(RcppRoll)
range <- 40 # how many values to sum, i.e. window size
offset <- 10 # e.g sum(1:40) goes to row 50
# lag() shifts v down by `offset` rows; a right-aligned rolling sum of the
# shifted series at row i then covers the original rows ending at i - offset,
# so row 50 holds sum(v[1:40]); fill = NA pads rows with incomplete windows
df <- mutate(df, roll_sum = RcppRoll::roll_sum(lag(v, n = offset),
n = range, fill = NA, align = "right"))
df[(range+offset):(range+offset+5), ]
# v roll_sum
# 50 50 820
# 51 51 860
# 52 52 900
# 53 53 940
# 54 54 980
# 55 55 1020
# sanity check: first two window sums match the printed roll_sum values
sum(1:range); sum(2:(range+1))
# [1] 820
# [1] 860

R: Formatting youtube video duration into proper time (seconds)

I have vector (column data) which contains youtube playback duration in a character string format in R.
x <- c("PT1H8S", "PT9M55S", "PT13M57S", "PT1M5S", "PT30M12S", "PT1H21M5S", "PT6M48S", "PT31S", "PT2M")
How do I get rid of PT then get the overall duration in seconds format?
Resultant vector should be c(3608, 595, 837, 65, 1812, 4865, 408, 31, 120)
example: PT1H21M5S in the form of seconds = 4865.
(calculated as 1H = 1*3600, 21M = 21*60, 5S = 5*1)
I wrote a little apply loop with regex commands, deleting everything but the seconds, minutes, or hours and then converting everything into seconds.
x <- c("PT1H8S", "PT9M55S", "PT13M57S", "PT1M5S", "PT30M12S", "PT1H21M5S", "PT6M48S")
# For each ISO-8601 duration string: strip everything except the seconds
# digits, then add 60 * minutes and 3600 * hours whenever those units appear.
# vapply guarantees one double per input (names are kept, as with sapply).
x2 <- vapply(x, function(dur) {
  secs <- as.numeric(gsub("^(.*)M|^(.*)H|S$", "", dur))
  if (grepl("M", dur)) {
    secs <- secs + as.numeric(gsub("^(.*)PT|^(.*)H|M(.*)$", "", dur)) * 60
  }
  if (grepl("H", dur)) {
    secs <- secs + as.numeric(gsub("^(.*)PT|H(.*)$", "", dur)) * 3600
  }
  secs
}, numeric(1))
x2
PT1H8S PT9M55S PT13M57S PT1M5S PT30M12S PT1H21M5S PT6M48S
3608 595 837 65 1812 4865 408
EDIT: Per request
x <- c("PT1H8S", "PT9M55S", "PT13M57S", "PT1M5S", "PT30M12S", "PT1H21M5S", "PT6M48S", "PT31S", "PT2M")
# Parse each ISO-8601 duration into seconds. Starting the accumulator at 0
# lets any of the H/M/S parts be absent (e.g. "PT31S", "PT2M"). vapply
# replaces sapply so the result type is pinned to one double per input.
x2 <- vapply(x, function(i){
  t <- 0
  # seconds: strip the PT/M/H prefix and the trailing S
  if(grepl("S", i)) t <- t + as.numeric(gsub("^(.*)PT|^(.*)M|^(.*)H|S$", "", i))
  # minutes: strip the prefix and everything from M onward, then * 60
  if(grepl("M", i)) t <- t + as.numeric(gsub("^(.*)PT|^(.*)H|M(.*)$", "",i)) * 60
  # hours: strip the prefix and everything from H onward, then * 3600
  if(grepl("H", i)) t <- t + as.numeric(gsub("^(.*)PT|H(.*)$", "",i)) * 3600
  t
}, numeric(1))
x2
PT1H8S PT9M55S PT13M57S PT1M5S PT30M12S PT1H21M5S PT6M48S PT31S PT2M
3608 595 837 65 1812 4865 408 31 120
This should cover all the cases. If there are more, the trick is to alter the regex. ^ is the beginning of the character vector, $ is the end. (.*) is everything. So ^(.*)H means everything between beginning and H. We replace this with nothing.
Here's a dplyr and stringr solution:
df %>%
# extract hours, minutes, and seconds and convert to numeric:
mutate(
h = as.numeric(str_extract(x, "(?<=PT)\\d+(?=H)")),
m = as.numeric(str_extract(x, "(?<=PT|H)\\d+(?=M)")),
s = as.numeric(str_extract(x, "(?<=PT|H|M)\\d+(?=S)"))
) %>%
# replace NA with 0 in the numeric unit columns only. The original used
# across(everything(), replace_na, 0): passing extra arguments through
# across() is deprecated (dplyr >= 1.1), and everything() also hits the
# character column x, where a 0 replacement fails tidyr's type check.
mutate(
across(c(h, m, s), ~ replace_na(.x, 0))
) %>%
# calculate time in seconds:
mutate(sec = h*3600+m*60+s)
x h m s sec
1 PT1H8S 1 0 8 3608
2 PT9M55S 0 9 55 595
3 PT13M57S 0 13 57 837
4 PT1M5S 0 1 5 65
5 PT30M12S 0 30 12 1812
6 PT1H21M5S 1 21 5 4865
7 PT6M48S 0 6 48 408
8 PT31S 0 0 31 31
9 PT2M 0 2 0 120
Data:
df <- data.frame(x = c("PT1H8S", "PT9M55S", "PT13M57S", "PT1M5S", "PT30M12S", "PT1H21M5S", "PT6M48S", "PT31S", "PT2M"))
You can use Lubridate package:
library(lubridate)
x <- c("PT1H8S", "PT9M55S", "PT13M57S", "PT1M5S", "PT30M12S", "PT1H21M5S", "PT6M48S")
# duration() understands ISO-8601 duration strings like "PT1H8S" natively;
# as.numeric() on a Duration object yields its length in seconds
x2 <- as.numeric(duration(x))
x2
[1] 3608 595 837 65 1812 4865 408

Resources