Separate string column by row efficiently in R

I'm trying to separate a string column into two pieces by chopping up the string. It's best illustrated with the example below. rowwise() does work, but given the size of the data.frame, I'd like to use a more efficient method. How can I avoid using rowwise()?
library(dplyr)
library(stringr)
library(tidyr)
#make data
a <- "(1, 10)"
b <- "(10, 20)"
c <- "(20, 30)"
df <- data.frame(size = c(a,b,c))
# Goal is to separate the 'size' column into 'lower' and 'upper' by
# extracting the value contained in the parens and split by a comma.
# Once the column is split into 'upper' and 'lower' I will perform
# additional operations.
# DESIRED RESULT
size lower upper
<fct> <chr> <chr>
1 (1, 10) 1 10
2 (10, 20) 10 20
3 (20, 30) 20 30
# WHAT I HAVE TRIED
> #This works... but too inefficient
> df %>%
+ rowwise() %>%
+ mutate(lower = str_split(size, ",") %>% .[[1]] %>% .[1] %>%
+ str_split("\\(") %>% .[[1]] %>% .[2])
size lower
<fct> <chr>
1 (1, 10) 1
2 (10, 20) 10
3 (20, 30) 20
> # I'm not sure why this doesn't work
> df %>%
+ mutate(lower = str_split(size, ",") %>% .[[1]] %>% .[1] %>%
+ str_split("\\(") %>% .[[1]] %>% .[2])
size lower
1 (1, 10) 1
2 (10, 20) 1
3 (20, 30) 1
> # Not obvious how to use separate (tidyr)
> df %>%
+ separate(size, sep=",", c("lower", "upper"))
lower upper
1 (1 10)
2 (10 20)
3 (20 30)

You don't state your goal explicitly, but it seems like you want to extract the first number from a string. (As for why your vectorised attempt returns 1 for every row: str_split() returns a list with one element per row, so %>% .[[1]] always picks the pieces of the first row, and that single value is recycled down the column.) This is easy with stringi::stri_extract_first_regex:
library(stringi)
stri_extract_first_regex(df$size, "[0-9]+")
# [1] "1" "10" "20"
So in your case,
df %>% mutate(lower = as.numeric(stri_extract_first_regex(size, "[0-9]+")))
You can extract all numbers with stri_extract_all_regex.
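For example, it returns a list with one character vector per row:
stri_extract_all_regex(df$size, "[0-9]+")
# [[1]]
# [1] "1"  "10"
#
# [[2]]
# [1] "10" "20"
#
# [[3]]
# [1] "20" "30"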
Based on your edits:
df$nums = str_extract_all(df$size, "[0-9]+")
df$lower = as.numeric(sapply(df$nums, `[[`, 1))
df$upper = as.numeric(sapply(df$nums, `[[`, 2))
df
# size nums lower upper
# 1 (1, 10) 1, 10 1 10
# 2 (10, 20) 10, 20 10 20
# 3 (20, 30) 20, 30 20 30
Another way to go is to get rid of the parens and whitespace and then use separate:
df %>%
  mutate(just_nums = str_replace_all(size, "[^0-9,]", "")) %>%
  separate(just_nums, into = c("lower", "upper"))
# size lower upper
# 1 (1, 10) 1 10
# 2 (10, 20) 10 20
# 3 (20, 30) 20 30
The regex pattern "[^0-9,]" matches everything except numbers and commas.
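For example, on a single value:
str_replace_all("(1, 10)", "[^0-9,]", "")
# [1] "1,10"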

For rowwise operations, I prefer data.table.
Try this
library(data.table)
library(stringi)
#make data
a <- "(1, 10)"
b <- "(10, 20)"
c <- "(20, 30)"
dt <- data.table(c(a,b,c))
dt[, lower := tstrsplit(V1, ",")[1]]
dt[, lower := stri_replace_all_regex(lower, '\\(', '')]
dt
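A more compact variant of the same idea (my own sketch, not the answerer's code): strip the parens first, then let tstrsplit assign both columns in one step:
dt[, c("lower", "upper") := tstrsplit(gsub("[()]", "", V1), ", ")]
dt
#          V1 lower upper
# 1:  (1, 10)     1    10
# 2: (10, 20)    10    20
# 3: (20, 30)    20    30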

An option is to use tidyr::separate after removing both ( and ) from the data.
library(tidyverse)
df %>% mutate(size = gsub("\\(|)", "", size)) %>% # Both ( and ) have been removed.
  separate(size, c("Min", "Max"), sep = ",")
# Min Max
# 1 1 10
# 2 10 20
# 3 20 30
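A small variation on the above (my sketch): adding convert = TRUE makes separate() return numeric columns, which saves an as.numeric() step later:
df %>%
  mutate(size = gsub("\\(|)", "", size)) %>%
  separate(size, c("Min", "Max"), sep = ",", convert = TRUE)
# Min and Max come back as integers instead of character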

You are almost there. Here is my explanation for two approaches, one of which is similar to yours:
In the first snippet, I use unnest_tokens from the tidytext package, which splits the words onto separate rows. Since you want to extract the first item before the comma (I have assumed this from your example, although you should state it explicitly), I keep every other row with a filter command.
In the second snippet, I use a regex (you could also use str_replace here). I use map, since str_split returns a list, to iterate over the returned items and pass each to gsub, which replaces the regex match with the back-referenced group. To select only the first item, I index with [[1]] at the end.
library(tidyverse)
library(stringr)
library(tidytext)
df %>%
  unnest_tokens(lower, size, token = "words", drop = FALSE) %>%
  filter(row_number() %% 2 == 1)
df %>%
  mutate(lower = map(str_split(df$size, ","), function(x) gsub("\\((\\w+)", "\\1", x)[[1]]))
Output:
# size lower
# 1 (1, 10) 1
# 2 (10, 20) 10
# 3 (20, 30) 20
In case you want to extract both terms, the one before and the one after the comma, you can use the extract function as well:
tidyr::extract(df, size, c("lower", "upper"), regex= "\\((\\w+),\\s+(\\w+)\\)")
Output:
# lower upper
# 1 1 10
# 2 10 20
# 3 20 30


Calculate points in range with different properties

I have the following data frame that contains points that originate from different samples. Each point has a type.
I need to calculate, for each point belonging to a given sample of a given type (for instance for "Sample_1" type "A"), how many points of another type are around it within a given cutoff.
My current implementation uses "future.apply" and I was wondering if there is a more efficient way to solve this problem. The example here is limited and should run quickly; the real problem consists of several thousand rows and is much slower.
In the end I store the results in a list. For each element with "type" in "starting_point", this list holds the number of elements of type "target_point" within a threshold of 40.
library(future)
library(future.apply)
a_test = data.frame(
  ID = sample(c("Sample_1", "Sample_2", "Sample_3"), 100, replace = TRUE),
  type = sample(c("A", "B", "C", "D"), 100, replace = TRUE),
  xpos = sample(1:200, 100, replace = TRUE),
  ypos = sample(1:200, 100, replace = TRUE)
)
starting_point=c("A", "B")
target_point=c("C", "D")
threshold=40
result_per_pair=list()
for(sp in starting_point){
  ## Here I select a data frame of "Starting points" without looking
  ## from which ID they came from
  sp_tdf = a_test[a_test$type == sp, ]
  for(tp in target_point){
    ## Here I select a data frame of "Target points" without looking
    ## from which ID they came from
    tp_tdf = a_test[a_test$type == tp, ]
    ## I use future_sapply here, parallelizing on each line of "sp_tdf"
    plan(multisession)
    elements_around = future_sapply(1:nrow(sp_tdf), function(x, sp_tdf, tp_tdf, threshold2){
      xc = sp_tdf$xpos[x]
      yc = sp_tdf$ypos[x]
      ### NOTE HERE: At this point I select the points that are in the same
      ### ID as the current line of sp_tdf
      tp_tdf2 = tp_tdf[tp_tdf$ID == sp_tdf$ID[x], ]
      ares = tp_tdf2[(tp_tdf2$xpos - xc)^2 + (tp_tdf2$ypos - yc)^2 < threshold2, ]
      return(nrow(ares))
    }, sp_tdf = sp_tdf, tp_tdf = tp_tdf, threshold2 = threshold * threshold)
    a_newcol = paste0(tp, "_around_", sp)
    ## we need to create a copy of sp_tdf otherwise we add columns to the
    ## initial sp_tdf and we memorize them in the wrong place in the list
    sp_tdf_temp = sp_tdf
    sp_tdf_temp[, a_newcol] = elements_around
    result_per_pair[[paste0(tp, "_around_", sp)]] = rbind(result_per_pair[[paste0(tp, "_around_", sp)]], sp_tdf_temp)
  }
}
You can see the type of table I get here (rows truncated):
result_per_pair[1]
$C_around_A
ID type xpos ypos C_around_A
1 Sample_2 A 26 74 1
2 Sample_3 A 64 8 1
3 Sample_3 A 121 2 1
5 Sample_2 A 62 94 0
You can try the RANN::nn2 function:
library(tidyverse) # for the purrr/dplyr verbs used below
id_list <- split(a_test, a_test$ID)
res <- id_list %>%
  map(~select(.x, xpos, ypos)) %>%
  map(~RANN::nn2(.x, .x, k = nrow(.), searchtype = "radius", radius = threshold)) %>%
  map(1) %>%
  map2(
    id_list,
    function(x, y){
      seq_len(nrow(x)) %>%
        map(~x[.x, ] %>% .[. > 0]) %>%
        map(~y[.x, ]) %>%
        map("type") %>%
        map_dfr(table) %>%
        mutate(across(everything(), as.integer))
    }
  ) %>%
  map2_dfr(id_list, ~bind_cols(.y, .x))
Some speed improvement might be possible by replacing the tidyverse functions with base equivalents (hard to say how much without benchmarking your real data). Result:
res %>% head()
ID type xpos ypos A B C D
Sample_1 C 48 157 0 0 3 1
Sample_1 D 177 97 1 1 1 3
Sample_1 C 10 71 0 0 3 0
Sample_1 C 71 168 1 1 2 0
Sample_1 D 82 48 1 0 1 2
Sample_1 C 165 71 3 3 1 1
where columns A-D represent the count of each type within the same ID. I used seed 123 to generate a_test. You can adapt the algorithm to work with starting_point and target_point by splitting each element of id_list into two parts, those defined by starting_point and those by target_point, and adapting the data and query arguments of RANN::nn2.
Edit: here is a function based on the ideas in the comment above:
f <- function(df, threshold, start = levels(df$type), target = levels(df$type)){
  my_lists <- df %>%
    filter(type %in% c(start, target)) %>%
    split(.$ID) %>%
    map(
      function(x){
        map(
          list(start, target),
          ~filter(x, type %in% .x) %>% mutate(type = droplevels(type))
        )
      }
    ) %>%
    discard(~any(map_int(.x, nrow) == 0))
  indices <- my_lists %>%
    map(
      ~RANN::nn2(
        data = select(.x[[2]], xpos, ypos),
        query = select(.x[[1]], xpos, ypos),
        k = nrow(.x[[2]]),
        searchtype = "radius",
        radius = threshold
      )
    ) %>%
    map(1) %>%
    map(function(x) seq_len(nrow(x)) %>% map(~x[.x, ] %>% .[. > 0]))
  my_lists %>%
    map(2) %>%
    map2(indices, function(x, y) map_dfr(y, ~summary(x[.x, ]$type))) %>%
    {map2_dfr(map(my_lists, 1), ., bind_cols)}
}
To count C around A with radius 40:
f(a_test, 40, "A", "C")
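One caveat from me (not part of the original answer): f relies on type being a factor, via levels() and droplevels(). Since R 4.0, data.frame() no longer creates factors by default, so you may need to convert first:
a_test$type <- factor(a_test$type) # needed on R >= 4.0, where strings stay character
f(a_test, 40, "A", "C")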

Create a loop for calculating values from a dataframe in R?

Let's say I make a dummy dataframe with 6 columns with 10 observations:
X <- data.frame(a=1:10, b=11:20, c=21:30, d=31:40, e=41:50, f=51:60)
I need to create a loop that evaluates 3 columns at a time, adding the summed second and third columns and dividing this by the sum of the first column:
(sum(b)+sum(c))/sum(a) ... (sum(e)+sum(f))/sum(d) ...
I then need to construct a final dataframe from these values. For example using the dummy dataframe above, it would look like:
value
1. 7.454545
2. 2.84507
I imagine I need to use the next function to iterate within the loop, but I'm fairly lost! Thank you for any help.
You can split your data frame into groups of 3 columns by creating a vector with rep where each element repeats 3 times. Then, over this list of sub data frames, (s)apply a function that sums the second and third columns, adds the two, and divides by the sum of the first column.
out_vec <- sapply(
  split.default(X, rep(1:ncol(X), each = 3, length.out = ncol(X))),
  function(x) (sum(x[2]) + sum(x[3])) / sum(x[1])
)
data.frame(value = out_vec)
# value
# 1 7.454545
# 2 2.845070
You could also sum all the columns up front with colSums before the sapply, which is more efficient.
out_vec <- sapply(
  split(colSums(X), rep(1:ncol(X), each = 3, length.out = ncol(X))),
  function(x) (x[2] + x[3]) / x[1]
)
data.frame(value = out_vec, row.names = NULL)
# value
# 1 7.454545
# 2 2.845070
You could use tapply:
tapply(colSums(X), gl(ncol(X)/3, 3), function(x)sum(x[-1])/x[1])
1 2
7.454545 2.845070
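Here gl(ncol(X)/3, 3) builds the grouping factor that marks off blocks of three columns:
gl(2, 3)
# [1] 1 1 1 2 2 2
# Levels: 1 2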
Here is an option with tidyverse
library(dplyr) # 1.0.0
library(tidyr)
X %>%
  summarise(across(.fn = sum)) %>%
  pivot_longer(everything()) %>%
  group_by(grp = as.integer(gl(n(), 3, n()))) %>%
  summarise(value = sum(lead(value) / first(value), na.rm = TRUE)) %>%
  select(value)
# A tibble: 2 x 1
# value
# <dbl>
#1 7.45
#2 2.85
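For completeness, a base-R sketch of the same arithmetic (my own variant, assuming ncol(X) is a multiple of 3): fold the column sums into a 3-row matrix and work column-wise:
m <- matrix(colSums(X), nrow = 3) # one column per group of three
data.frame(value = (m[2, ] + m[3, ]) / m[1, ])
#      value
# 1 7.454545
# 2 2.845070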

Add the numbers in every row and take the sum

Having a dataframe like this
data.frame(id = c(1,2), num = c("30, 4, -2,","10, 20"))
How is it possible to take the sum of every row from the column num, including the minus sign in the calculation?
Example of expected output:
data.frame(id = c(1,2), sum = c(32, 30))
Using Base R you could do the following:
# data
df <- data.frame(id = c(1,2), num = c("30, 4, -2,","10, 20"))
# split by ",", convert to numeric and then sum
df[, 2] <- sapply(strsplit(as.character(df$num), ","), function(x){
  sum(as.numeric(x))
})
# result
df
# id num
# 1 1 32
# 2 2 30
If you can use packages, the tidy packages make this easy and use tidy data principles, which are quick and easy once you get used to thinking this way.
library(tidyr)
library(dplyr)
df %>%
  # Convert the string of numbers to a tidy dataframe
  # with one number per row, keeping the id column for grouping
  separate_rows(num, sep = ",") %>%
  # Convert the text to a number so we can sum
  mutate(num = as.numeric(num)) %>%
  # Perform the calculation for each id
  group_by(id) %>%
  # Sum the numbers
  summarise(sum = sum(num, na.rm = TRUE)) %>%
  # Ungroup for further use of the data
  ungroup()
# A tibble: 2 x 2
# id sum
# <dbl> <dbl>
# 1 1 32
# 2 2 30
library(stringr)
df <- data.frame(id = c(1,2), num = c("30, 4, -2","10, 20"))
df$sum <- NA
for (i in 1:nrow(df)) {
  temp <- as.character(df[i, 2])
  n_num <- str_count(temp, '[0-9.]+')
  total <- 0
  for (j in 1:n_num) {
    digit <- strsplit(temp, ',')[[1]][j]
    total <- total + as.numeric(digit)
    temp <- sub(digit, '', temp)
  }
  df[i, 'sum'] <- total
}
print(df)
id num sum
1 1 30, 4, -2 32
2 2 10, 20 30

How to identify repeated subsequences in a dataset

I have a dataset of numerical values, each represent a zone.
eg.
x <- c(1,6,1,2,3,4,5,8,5,9,10,1,2,3,10,7,5,9,4,1,2,3)
I need to identify whether there are repeated subsequences within the data, i.e. whether the subject repeatedly travelled from zone 1 to 2 to 3. In the above example, 1,2,3 would give a value of 3. I don't know the subsequences in advance; I need R to find them from the data.
Following that I need to calculate how many times this subsequence appears in the data.
Very basic knowledge or R so forgive me for my ignorance if this is a simple task!
Here's a way to find which sequences of length n repeat, and how many times each occurs.
For n = 3
library(tidyverse) # not necessary, see base version below
n <- 3
lapply(seq(0, length(x) - n), `+`, seq(n)) %>% # get index of all subsequences
  map_chr(~ paste(x[.], collapse = ',')) %>%   # paste together as character
  table %>%                                    # get number of times each occurs
  `[`(. > 1)                                   # select sequences occurring > 1 time
# 1,2,3
# 3
For n = 2
n <- 2
lapply(seq(0, length(x) - n), `+`, seq(n)) %>%
  map_chr(~ paste(x[.], collapse = ',')) %>%
  table %>%
  `[`(. > 1)
# 1,2 2,3 5,9
# 3 3 2
Without Tidyverse
seqs <- lapply(seq(0, length(x) - n), `+`, seq(n))
seqs.char <- sapply(seqs, function(i) paste(x[i], collapse = ','))
tbl <- table(seqs.char)
tbl[tbl > 1]
I'll add my own question: does anyone know how to do this without converting to character first? e.g. a fun where fun(list(1:2, 1:2, 2:3)) tells you 1:2 occurs twice and 2:3 occurs once?
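On that aside, one partial workaround (my own sketch, not from the thread): when the zones are small non-negative integers, each window can be encoded as a single number, so the windows themselves are never pasted into strings (table() still turns the keys into names for display):
n <- 3
idx <- seq(0, length(x) - n)
keys <- sapply(idx, function(i) sum(x[i + seq_len(n)] * 100^((n - 1):0))) # base-100 encoding, valid while max(x) < 100
tbl <- table(keys)
tbl[tbl > 1]
# keys
# 10203
#     3
# (10203 decodes back to the window 1,2,3)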
An alternative tidyverse approach that creates a big dataframe of results based on how many values you want your subsequences to have:
library(tidyverse)
# example vector
x <- c(1,6,1,2,3,4,5,8,5,9,10,1,2,3,10,7,5,9,4,1,2,3)
# function that takes as input the number of consecutive elements in a subsequence
# and returns a dataframe ordered descending by counts of occurrence
f = function(n) {
  data.frame(value = x) %>%             # get the vector x
    slice(1:(nrow(.) - n + 1)) %>%      # remove values not needed from the end
    mutate(position = row_number()) %>% # add position of each value
    rowwise() %>%                       # for each value/row
    mutate(vec = paste0(x[position:(position + n - 1)], collapse = ",")) %>% # create the subsequence as a string
    ungroup() %>%                       # forget the grouping
    count(vec, sort = T)                # order by counts descending
}
2:5 %>% # specify how many values per subsequence to investigate (here from 2 to 5)
  map_df(~ data.frame(NumElements = ., f(.))) %>% # apply the function and keep the element counts
  arrange(desc(n)) %>% # order by counts descending
  tbl_df()             # (only for visualisation purposes)
# # A tibble: 88 x 3
# NumElements vec n
# <dbl> <chr> <int>
# 1 2 1,2 3
# 2 2 2,3 3
# 3 3 1,2,3 3
# 4 2 5,9 2
# 5 2 1,6 1
# 6 2 10,1 1
# 7 2 10,7 1
# 8 2 3,10 1
# 9 2 3,4 1
# 10 2 4,1 1
# # ... with 78 more rows
The approach below finds sequences of any length (k): the input vector is converted into a matrix with k columns; this is done k times, each time adding 0:(k-1) NAs to the beginning. Finally, all rows of these k matrices are counted (after paste'ing the elements together):
frs <- function(x, k = 2){
  padit <- function(.) c(., rep(NA, k - length(.) %% k))
  xx <- lapply(1:k, function(iii) padit(c(rep(NA, iii - 1), x)))
  xx <- do.call(rbind, lapply(xx, function(.) matrix(., ncol = k, byrow = TRUE)))
  xx <- sapply(split(xx, 1:NROW(xx)), paste, collapse = ",")
  (function(x) x[x > 1])(table(xx))
}
Output:
> frs(x,2)
xx
1,2 2,3 5,9
3 3 2
> frs(x,3)
1,2,3
3
> frs(x,4)
named integer(0)

How to change code T-25-4 into T-25-04 in a dataframe in R?

I have a data.frame in R. The first column contains codes like T-25-4. I want to change them to T-25-04 and so on, so the last number always has 2 digits.
Example:
T-25-1
T-25-2
T-25-3
T-25-4
T-25-5
T-25-6
T-25-7
T-25-8
T-25-9
Borrowing the first part of ycw's answer, but simpler with mutate and gsub:
library(tidyverse)
dt <- data_frame(Col = c("T-25-1", "T-25-2", "T-25-3", "T-25-4", "T-25-5",
"T-25-6", "T-25-7", "T-25-8", "T-25-9"))
dt %>%
  mutate(Col = gsub("(\\d)$", paste0("0", "\\1"), Col))
If the last number can go above 9 and you don't want to add a 0 in that case:
dt %>%
  mutate(Col = ifelse(nchar(sub(".*-(\\d+)$", "\\1", Col)) < 2, # check if the last number is less than 10
                      sub("(\\d+)$", paste0("0", "\\1"), Col),  # add 0 in front if less than 10
                      Col))
We can use functions from tidyverse and stringr. dt2 is the final output.
library(tidyverse)
library(stringr)
# Create example data frame
dt <- data_frame(Col = c("T-25-1", "T-25-2", "T-25-3", "T-25-4", "T-25-5",
"T-25-6", "T-25-7", "T-25-8", "T-25-9"))
# Process the data
dt2 <- dt %>%
  # Separate the original column into three columns
  separate(Col, into = c("Col1", "Col2", "Col3")) %>%
  # Pad zero to Col3 until the width is 2
  mutate(Col3 = str_pad(Col3, width = 2, side = "left", pad = "0")) %>%
  # Combine all three columns separated by "-"
  unite(Col, Col1:Col3, sep = "-")
# View the results
dt2
# A tibble: 9 x 1
Col
* <chr>
1 T-25-01
2 T-25-02
3 T-25-03
4 T-25-04
5 T-25-05
6 T-25-06
7 T-25-07
8 T-25-08
9 T-25-09
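For what it's worth, a compact base-R alternative (my own sketch, using the same dt): rebuild each code with sprintf(), which zero-pads the trailing number:
dt$Col <- sprintf("%s%02d",
                  sub("\\d+$", "", dt$Col),                     # keep the "T-25-" prefix
                  as.integer(sub(".*-(\\d+)$", "\\1", dt$Col))) # zero-pad the trailing number
dt$Col
# [1] "T-25-01" "T-25-02" "T-25-03" "T-25-04" "T-25-05" "T-25-06" "T-25-07" "T-25-08" "T-25-09"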
