Efficient way to cbind list by groups in data.table - r

I have a data.frame
data
data = structure(list(mystring = c("AASDAASADDLKJLKADDLKKLLKJLJADDLJLKJLADLKLADD",
"ASDSDFJSKADDKJSJKDFKSADDLKJFLAK"), class = c("cat", "dog")), .Names = c("mystring",
"class"), row.names = c(NA, -2L), class = "data.frame")
which looks like
#> dtt1
# mystring class
#1 AASDAASADDLKJLKADDLKKLLKJLJADDLJLKJLADLKLADD cat
#2 ASDSDFJSKADDKJSJKDFKSADDLKJFLAK dog
I am searching for the start and end positions of the pattern "ADD" within the first 20 characters of the strings under mystring, using class as the group.
I am doing this using str_locate of stringr package. Here is my attempt
setDT(dtt1)[,
cbind(list(str_locate_all(substr(as.character(mystring), 1, 20),"ADD")[[1]][,1]),
list(str_locate_all(substr(as.character(mystring), 1, 20),"ADD")[[1]][,2])),
by = class]
This gives my desired output
# class V1 V2
#1: cat 8 10
#2: cat 16 18
#3: dog 10 12
Question:
I would like to know if this is a standard approach or whether it can be done in a more efficient manner. str_locate gives the start and end positions of the matched pattern in separate columns, and I am putting them in separate lists to cbind them together with the data.table. Also, how can I specify the colnames for the cbinded columns here?

I think you first should reduce your operations per group, so I would first create a substring for all groups at once.
# Take the first 20 characters once, for all rows, before any grouping.
# The public substr() is already vectorised; .Internal() is reserved for
# code inside base R and should not be called from user code.
setDT(data)[, submystring := substr(mystring, 1L, 20L)]
Then, using the stringi package (I don't like wrappers), you could do (though can't currently vouch for efficiency)
library(stringi)
data[, data.table(matrix(unlist(stri_locate_all_fixed(submystring, "ADD")), ncol = 2)), by = class]
# class V1 V2
# 1: cat 8 10
# 2: cat 16 18
# 3: dog 10 12
Alternatively, you could avoid the matrix and data.table calls per group and instead spread the data after all the locations have been detected
# Locate every "ADD" per class. unlist() flattens the start/end matrix
# column-wise, so each group holds all starts first, then all ends.
# NOTE(review): this layout assumes one string per class -- confirm for real data.
res <- data[, unlist(stri_locate_all_fixed(submystring, "ADD")), by = class]
# A group with k matches has .N == 2 * k rows: label the first half "V1" (start)
# and the second half "V2" (end), and number the matches 1..k once per half.
# Repeating the 1..k sequence k times (instead of twice) only has the right
# length when k is 1 or 2.
res[, `:=`(varnames = rep(c("V1", "V2"), each = .N / 2),
           MatchCount = rep(seq_len(.N / 2), times = 2L)), by = class]
dcast(res, class + MatchCount ~ varnames, value.var = "V1")
# class MatchCount V1 V2
# 1: cat 1 8 10
# 2: cat 2 16 18
# 3: dog 1 10 12
A third, similar option is to first run stri_locate_all_fixed over the whole data set and only then unlist per group (instead of running both unlist and stri_locate_all_fixed per group)
# Run the pattern search once over the whole column; V1 is a list column
# holding one start/end matrix per row.
res <- data[, .(stri_locate_all_fixed(submystring, "ADD"), class = class)]
# Each matrix has two columns, so half its length is the number of matches.
res[, N := lengths(V1) / 2L]
# Flatten per group; the values come out as all starts, then all ends.
res2 <- res[, unlist(V1), by = "class,N"]
# Label the first N values "V1" (start) and the next N "V2" (end), and number
# the matches 1..N once per half. Repeating the sequence N times (instead of
# twice) only has the right length when N is 1 or 2.
res2[, `:=`(varnames = rep(c("V1", "V2"), each = N[1L]),
            MatchCount = rep(seq_len(N[1L]), times = 2L)), by = class]
dcast(res2, class + MatchCount ~ varnames, value.var = "V1")
# class MatchCount V1 V2
# 1: cat 1 8 10
# 2: cat 2 16 18
# 3: dog 1 10 12

We could change the matrix output from str_locate_all to data.frame and use rbindlist to create the columns.
setDT(data)[,rbindlist(lapply(str_locate_all(substr(mystring, 1, 20),
'ADD'), as.data.frame)) , class]
# class start end
#1: cat 8 10
#2: cat 16 18
#3: dog 10 12

Here's how I did it.
library(stringi)
library(dplyr)
library(magrittr)
data = structure(list(mystring = c("AASDAASADDLKJLKADDLKKLLKJLJADDLJLKJLADLKLADD",
"ASDSDFJSKADDKJSJKDFKSADDLKJFLAK"), class = c("cat", "dog")), .Names = c("mystring",
"class"), row.names = c(NA, -2L), class = "data.frame")
# Locate every "ADD" within the first 20 characters of the string held in
# `row$mystring` and return the start/end positions as a tibble.
#   row : a one-row data frame (one group handed over by do()) with a
#         `mystring` column
my_function <- function(row) {
  row$mystring %>%
    stri_sub(to = 20) %>%
    stri_locate_all_fixed(pattern = "ADD") %>%
    extract2(1) %>%   # the single input string yields a one-element list
    as_tibble()       # as_data_frame() is deprecated in favour of as_tibble()
}

# One group per distinct string; the join on `mystring` brings `class` back in.
test <-
  data %>%
  group_by(mystring) %>%
  do(my_function(.)) %>%
  left_join(data, by = "mystring")

Related

Adding column sums to a data.table as a new row at the end

I am doing some operations on a data.table and getting a result. So far so good. Next, I want the result to also show the sums across some columns, but I can't get that to work.
I filter my table by rows where x1=1, and compute a metric by Group1:
dt[x1 == 1, .N, by = c("Group1")][,
"%" := round(N /sum(N) * 100, 0)] [
]
giving
Group1 N %
1: 2 6 40
2: 1 6 40
3: 3 2 13
4: 5 1 7
I would just like to add a row to the above table that gives the sum across all columns.
I can just do
colSums(.Last.value)
and get the answer in a separate console, but what if I wanted to just append a new row to the above table itself, something like:
Group1 N %
1: 2 6 40
2: 1 6 40
3: 3 2 13
4: 5 1 7
ColSum: -- 15 100
Since I don't understand your sample dataset, I guess this can help solve your issue.
I would suggest that you use the janitor package to wrap up your column total or row total
See sample below
library(janitor)
library(dplyr)  # select() is a dplyr function and is not attached by janitor
set.seed(10)
# Draw 10 random row indices from iris
df_sample <- sample(seq_len(nrow(iris)), 10)
df <- iris[df_sample, ]
# where = "row": append a totals row holding the sum of each numeric column
df %>%
  select(Species, Sepal.Width, Petal.Length, Petal.Width) %>%
  adorn_totals(where = "row")
# where = "col": append a totals column holding the sum across each row
df %>%
  select(Species, Sepal.Width, Petal.Length, Petal.Width) %>%
  adorn_totals(where = "col")
I hope that this answered your question.
As a hacked mod to akrun's (since deleted) answer, here's a custom printing function that works around data.table's omission of row names.
# Print `x` via data.table's print method, but with the leading row numbers
# ("1:", "2:", ...) replaced by the corresponding row names of `x`.
#   x   : object passed to data.table:::print.data.table (presumably a data.table)
#   ... : further arguments forwarded to data.table:::print.data.table
# The modified text is written out with cat().
prettyDT <- function(x, ...) {
# Capture the printed table as a character vector, one element per line.
out <- capture.output(data.table:::print.data.table(x, ...))
nms <- rownames(x)
# Match a run of digits at the start of a line that is directly followed by ":".
# The lookahead keeps the colon itself out of the match.
gre <- gregexpr("^([0-9]+)(?=:)", out, perl = TRUE)
# Use each matched row number as an index into the row names.
# NOTE(review): lines without a match (e.g. the header) are expected to yield NA
# here, and the extra `nms` argument to as.integer() looks unused -- confirm.
newnms <- nms[as.integer(regmatches(out, gre), nms)]
wids <- nchar(newnms)
# Left-pad every row name that was found to the width of the longest one.
newnms[!is.na(wids)] <- sprintf(paste0("%", max(wids, na.rm = TRUE), "s"), newnms[!is.na(wids)])
# Swap the matched row numbers for the padded row names.
regmatches(out, gre)[!is.na(wids)] <- newnms[!is.na(wids)]
# Indent the lines that had no row number by (longest - shortest) name width.
# NOTE(review): presumably this keeps the columns aligned only when the shortest
# row name is as wide as the original row-number prefix -- confirm.
pre <- strrep(" ", diff(range(wids, na.rm = TRUE)))
out[is.na(wids)] <- paste0(pre, out[is.na(wids)])
cat(out, sep = "\n")
}
With this, we can do:
out <- rbindlist(list(
DT,
DT[, c(.(Group1 = "--"), lapply(.SD, sum)), .SDcols = c("N", "%")]
))
rownames(out)[nrow(out)] <- "Colsum"
prettyDT(out)
# Group1 N %
# <char> <int> <int>
# 1: 2 6 40
# 2: 1 6 40
# 3: 3 2 13
# 4: 5 1 7
# Colsum: -- 15 100
Admittedly, this is a bit of a hack, and requires explicit calling of a udf to get the desired output.
Data
DT <- setDT(structure(list(Group1 = c("2", "1", "3", "5"), N = c(6L, 6L, 2L, 1L), "%" = c(40L, 40L, 13L, 7L)), class = c("data.table", "data.frame"), row.names = c(NA, -4L)))

Adding a numbered prefix to column names in a for loop

I have 50 dataframes that all have the same column names (e.g. df1: colnames = Id, A,B,C,D, df2: colnames = ID, A,B,C,D and so on).
I need to rename these so it becomes df1: colnames = ID, Mth1_A, Mth1_B, Mth1_C, Mth1_D and then df2: ID, Mth2_A, Mth2_B, Mth2_C, Mth2_D. So each column name should correspond to the number of the dataframe.
I've created a function that does this;
col_prefix <- function(df, Mth){
  # Prefix every column name except the first with the literal text "Mth_".
  # The `Mth` argument is accepted but never read in the body.
  idx <- 2:ncol(df)
  prefixed <- paste("Mth", colnames(df)[idx], sep = "_")
  colnames(df)[idx] <- prefixed
  df
}
But I'm now trying to create a loop to do it for all 50 and I can't get it to work. This is what I've got so far
dfList <- c("df1", "df2",...,"df50")
for (filename in dfList){
i <- get(filename)
i <- col_prefix(i, Mth)
}
It's adding the prefix "Mth" to the dataframes but it's not doing "Mth1", "Mth2", etc. I'm fairly sure this is because in my function Mth is a character but I don't know how to loop through this.
Please help!
Put them in a list and use their name (df1, df2, etc...)to catch the prefix, i.e.
l1 <- mget(grep(pattern = "df[0-9]+", x = ls(), value = TRUE))
Map(function(x, y) setNames(x, paste0('MTH', gsub('\\D+', '', y), '_', names(x))),
l1, names(l1))
$df1
MTH1_v1 MTH1_v2
1 5 9
2 6 10
3 7 11
$df2
MTH2_v1 MTH2_v2
1 15 19
2 16 110
3 17 111
To change all names except the first one then,
Map(function(x, y) data.frame(x[1], setNames(x[-1], paste0('MTH', gsub('\\D+', '', y), '_', names(x)[-1]))), l1, names(l1))
$df1
v1 MTH1_v2
1 5 9
2 6 10
3 7 11
$df2
v1 MTH2_v2
1 15 19
2 16 110
3 17 111
DATA
dput(df1)
structure(list(v1 = c(5, 6, 7), v2 = c(9, 10, 11)), class = "data.frame", row.names = c(NA,
-3L))
dput(df2)
structure(list(v1 = c(15, 16, 17), v2 = c(19, 110, 111)), class = "data.frame", row.names = c(NA,
-3L))

Splitting values in a column

sorry I'm new to R but I've got some data that looks like the following:
I'd like to count the number of times each object is mentioned in the findings. So the result would look like this:
I've tried tidyverse and separate but can't seem to get the hang of it, any help would be amazing, thanks in advance!
To recreate my data:
df <- data.frame(
col_1 = paste0("image", 1:5),
findings = c("rock|cat|sun", "cat", "cat|dog|fish|sun", "sun", "dog|cat")
)
You can use separate_rows() and then count().
library(tidyverse)
df %>%
separate_rows(findings) %>%
count(findings)
# # A tibble: 5 x 2
# findings n
# <chr> <int>
# 1 cat 4
# 2 dog 2
# 3 fish 1
# 4 rock 1
# 5 sun 3
Data
df <- structure(list(col_1 = c("image_1", "image_2", "image_3", "image_4",
"image_5"), findings = c("rock|cat|sun", "cat", "cat|dog|fish|sun",
"sun", "dog|cat")), class = "data.frame", row.names = c(NA, -5L))
In base R:
as.data.frame(table(unlist(strsplit(df$col_2, "|", fixed = TRUE))))
# Var1 Freq
# 1 cat 4
# 2 dog 2
# 3 fish 1
# 4 rock 1
# 5 sun 3
Reproducible data (please provide it in your next post):
df <- data.frame(
col_1 = paste0("image", 1:5),
col_2 = c("rock|cat|sun", "cat", "cat|dog|fish|sun", "sun", "dog|cat")
)
An option with cSplit
library(splitstackshape)
cSplit(df, 'col_2', 'long', sep="|")[, .N, col_2]
# col_2 N
#1: rock 1
#2: cat 4
#3: sun 3
#4: dog 2
#5: fish 1
data
df <- structure(list(col_1 = c("image1", "image2", "image3", "image4",
"image5"), col_2 = c("rock|cat|sun", "cat", "cat|dog|fish|sun",
"sun", "dog|cat")), class = "data.frame", row.names = c(NA, -5L
))
Using tidyverse:
df %>%
separate_rows(findings) %>%
group_by(findings) %>%
summarize(total_count_col=n())
First we convert the data into a long format using separate_rows, then group and count the number of rows with each finding.
Example:
df<-data.frame(col1=c(rep(letters[1:3],3),"d"),col2=c(rep("moose|cat|dog",9),"rock"), stringsAsFactors = FALSE)
df %>% separate_rows(col2) %>% group_by(col2) %>% summarize(total_count_col=n())
# A tibble: 4 x 2
col2 total_count_col
<chr> <int>
1 cat 9
2 dog 9
3 moose 9
4 rock 1

Append 0 to missing observations in a dataframe.

I have a dataset where I expect a fixed number of observations in a data-frame
A 20
B 10
C 5
However, upon running my analysis this is not always the case; sometimes I find missing observations and the resulting dataframe looks like this
A 10
C 5
In this case there are no observations for B. I would like to append 0 observations to the final dataframe before plotting, so as to indicate the values of the missing observations.
final data frame should look like this
A 10
B 0
C 5
How can I accomplish this in R?
If you define the ID column (with A,B,C) as factor which seems appropriate here, you could plot the data and even those factor levels which are not in the data (but in the defined factor levels) will be plotted. Here's a small example:
# ID must be a factor so that unused levels (here "B") survive subsetting and
# still get a slot in the plot. Since R 4.0.0 data.frame() no longer converts
# strings to factors by default, so the conversion is made explicit.
df <- data.frame(ID = factor(LETTERS[1:3]), x = rnorm(3))
df
# ID x
#1 A 1.350458
#2 B 1.340855
#3 C 1.311329
subdf <- df[c(1,3),]
subdf
# ID x
#1 A 1.350458
#3 C 1.311329
with(subdf, plot(x ~ ID))
You'll find that "B" is also present in the plot although it's not in the subsetted data.
Maybe you can do something with melt and dcast from "reshape2".
Here's what I had in mind:
library(reshape2)
out <- dcast(
melt( # Makes a data.frame from a list
mget(ls(pattern = "df\\d")), # Collects the relevant df in a list
id.vars = "V1"), # The variable to melt by
L1 ~ V1, value.var = "value", fill = 0) # Other options for dcast
out
# L1 A B C
# 1 df1 20 10 5
# 2 df2 10 0 5
From there, you could go back to a long data form.
melt(out, id.vars = "L1")
# L1 variable value
# 1 df1 A 20
# 2 df2 A 10
# 3 df1 B 10
# 4 df2 B 0
# 5 df1 C 5
# 6 df2 C 5
If separate data.frames are required, then you can also look at using split, but if you are just going to be plotting, this format should work just fine.
Sample data
df1 <- structure(list(V1 = c("A", "B", "C"), V2 = c(20L, 10L, 5L)),
.Names = c("V1", "V2"), class = "data.frame",
row.names = c(NA, -3L))
df2 <- structure(list(V1 = c("A", "C"), V2 = c(10L, 5L)),
.Names = c("V1", "V2"), class = "data.frame",
row.names = c(NA, -2L))

Merging different columns with the same name into single columns

I have a data.frame in which some columns have the same name. Now I want to merge/add up these columns into single columns. So for example I want to turn....
v1 v1 v1 v2 v2
1 0 2 4 1
3 1 1 1 0
...into...
v1 v2
3 5
5 1
I only found threads dealing with two data.frames supposed to be merged into one but none dealing with this (rather simple?) problem.
The data can be recreated with this:
df <- structure(list(v1 = c(1L, 3L), v1 = 0:1, v1 = c(2L, 1L),
v2 = c(4L, 1L), v2 = c(1L, 0L)),
.Names = c("v1", "v1", "v1", "v2", "v2"),
class = "data.frame", row.names = c(NA, -2L))
# Bucket the columns by name, add up the columns of each bucket elementwise,
# and reassemble the per-name sums into a data frame.
as.data.frame(
  lapply(
    split.default(df, names(df)),
    Reduce,
    f = `+`
  )
)
produces:
v1 v2
1 3 5
2 5 1
split.default(...) breaks up the data frame into groups with equal column names, then we use Reduce on each of those groups to sum the values of each column in the group iteratively until there is only one column left per group (see ?Reduce, that is what the function does), and finally we convert back to data frame with as.data.frame.
We have to use split.default because split (or really, split.data.frame, which it will dispatch) splits on rows, not columns.
You can do this quite easily with melt and dcast from "reshape2". Since there's no "id" variable, I've used melt(as.matrix(df)) instead of melt(df, id.vars="id"). This automatically creates a long version of your data that has "Var1" as representing your rownames and "Var2" as your colnames. Using that knowledge, you can do:
library(reshape2)
dcast(melt(as.matrix(df)), Var1 ~ Var2,
value.var = "value", fun.aggregate=sum)
# Var1 v1 v2
# 1 1 3 5
# 2 2 5 1

Resources