Manipulate data with multiple values in a cell in R

I have a data set that looks like this:
universityies <- c("UNI.1;UNI.1;UNI.2;UNI.3", "UNI.5", "UNI.3;UNI.4")
papers <- c(1, 1, 1)
cited <- c(10, 5, 20)
df <- data.frame(universityies, papers, cited)
df
and I want to get something like this:
#       total papers  total cited
# UNI.1            1           10
# UNI.2            1           10
# UNI.3            2           30
# UNI.4            1           20
# UNI.5            1            5
Many thanks in advance!

We can split the data on ";", take the unique rows, group by universityies, and count the distinct papers and the total number of citations:
library(dplyr)
df %>%
  mutate(row = row_number()) %>%
  tidyr::separate_rows(universityies, sep = ";") %>%
  distinct() %>%
  group_by(universityies) %>%
  summarise(total_papers = n_distinct(row),
            total_cited = sum(cited))
#  universityies total_papers total_cited
#  <chr>                <int>       <dbl>
#1 UNI.1                    1          10
#2 UNI.2                    1          10
#3 UNI.3                    2          30
#4 UNI.4                    1          20
#5 UNI.5                    1           5
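A quick way to see why the distinct() step matters: the first row lists UNI.1 twice, so without dropping that duplicate its paper and citations would be double-counted. A small check (a sketch, using the df above):
library(dplyr)
library(tidyr)
df %>%
  mutate(row = row_number()) %>%
  separate_rows(universityies, sep = ";") %>%
  count(row, universityies) %>%
  filter(n > 1)   # row 1 shows UNI.1 with n = 2 before distinct() removes it
#    row universityies     n
#  <int> <chr>         <int>
#1     1 UNI.1             2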

We can use cSplit from splitstackshape along with data.table methods:
library(data.table)
library(splitstackshape)
unique(cSplit(setDT(df, keep.rownames = TRUE), "universityies", ";",
       "long"))[, .(total_papers = uniqueN(rn), total_cited = sum(cited)),
                .(universityies)]
#   universityies total_papers total_cited
#1:         UNI.1            1          10
#2:         UNI.2            1          10
#3:         UNI.3            2          30
#4:         UNI.5            1           5
#5:         UNI.4            1          20
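The data.table result keeps the order in which the universities are first seen (UNI.5 before UNI.4 above). If the sorted order of the expected output matters, one extra step fixes that (a sketch, storing the chain in res first):
res <- unique(cSplit(setDT(df, keep.rownames = TRUE), "universityies", ";",
       "long"))[, .(total_papers = uniqueN(rn), total_cited = sum(cited)),
                .(universityies)]
setorder(res, universityies)   # sort the rows by university, by reference
res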

You may use strsplit in a first step, then aggregate:
tmp <- do.call(rbind, apply(df, 1, function(x)
  setNames(data.frame(strsplit(x[1], ";"), as.numeric(x[2]), as.numeric(x[3]),
                      row.names = NULL, stringsAsFactors = FALSE), names(df))))
res <- aggregate(cbind(total.papers = papers, total.cited = cited) ~ universityies,
                 unique(tmp), sum)
res[order(res$universityies), ]
#   universityies total.papers total.cited
# 1         UNI.1            1          10
# 2         UNI.2            1          10
# 3         UNI.3            2          30
# 4         UNI.4            1          20
# 5         UNI.5            1           5
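A slightly more compact way to build tmp without apply (a sketch): split once, then expand the numeric columns with rep() and lengths():
s <- strsplit(as.character(df$universityies), ";")
tmp <- data.frame(universityies = unlist(s),
                  papers = rep(df$papers, lengths(s)),
                  cited  = rep(df$cited, lengths(s)),
                  stringsAsFactors = FALSE)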


R Extract specific text from column into multiple columns

I have a dataframe exported from the web in this format:
id vals
1 {7,12,58,1}
2 {1,2,5,7}
3 {15,12}
I would like to extract ONLY the numbers (ignoring the curly braces and commas) into multiple columns, like this:
id val_1 val_2 val_3 val_4 val_5
 1     7    12    58     1
 2     1     2     5     7
 3    15    12
Even though the maximum number of values here is 4, I always want columns up to val_5.
Thanks!
We could use str_extract_all for this:
library(dplyr)
library(stringr)
df %>%
  mutate(vals = str_extract_all(vals, '\\d+'))
or, as @akrun suggests in the comments,
df %>%
  mutate(vals = str_extract_all(vals, '\\d+')) %>%
  do.call(data.frame, .)
  id vals.1 vals.2 vals.3 vals.4
1  1      7     12     58      1
2  2      1      2      5      7
3  3     15     12   <NA>   <NA>
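If you need the result padded out to val_5 as the question asks, a base R sketch of the same extraction (using regmatches/gregexpr on the df under data: below) can pad each row with NA:
vals <- regmatches(df$vals, gregexpr("\\d+", df$vals))
m <- t(sapply(vals, function(x) as.numeric(x)[1:5]))  # pad each row to 5 values with NA
out <- data.frame(id = df$id, m)
names(out)[-1] <- paste0("val_", 1:5)
out
#  id val_1 val_2 val_3 val_4 val_5
#1  1     7    12    58     1    NA
#2  2     1     2     5     7    NA
#3  3    15    12    NA    NA    NA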
data:
df <- structure(list(id = 1:3, vals = c("{7,12,58,1}", "{1,2,5,7}",
"{15,12}")), class = "data.frame", row.names = c(NA, -3L))
Another possible tidyverse option: we remove the curly brackets, separate the rows on the ,, then pivot to wide form. We can then create the additional val_5 column (using add_column from tibble) based on the maximum value in the existing column names (which is 4 in this case).
library(tidyverse)
df %>%
  mutate(vals = str_replace_all(vals, "\\{|\\}", "")) %>%
  separate_rows(vals, sep = ",") %>%
  group_by(id) %>%
  mutate(ind = row_number()) %>%
  pivot_wider(names_from = ind, values_from = vals, names_prefix = "val_") %>%
  add_column(!!(paste0("val_", parse_number(names(.)[ncol(.)]) + 1)) := NA)
Output
  id val_1 val_2 val_3 val_4 val_5
1  1 7     12    58    1     NA
2  2 1     2     5     7     NA
3  3 15    12    <NA>  <NA>  NA
Data
df <- read.table(text = "id vals
1 {7,12,58,1}
2 {1,2,5,7}
3 {15,12} ", header = T)
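The add_column() call above relies on rlang-style injection: !! unquotes the computed column name and := allows it on the left-hand side. A smaller illustration of the same pattern (the name nm is arbitrary):
library(tibble)
nm <- paste0("val_", 5)  # column name computed at run time
add_column(tibble(a = 1:2), !!nm := NA)
# A tibble: 2 x 2
#      a val_5
#  <int> <lgl>
#1     1 NA
#2     2 NA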
Using data.table
library(data.table)
library(stringi)
result <- setDT(df)[, stri_match_all_regex(vals, '\\d+')[[1]], by = .(id)]
result[, item := paste('val', 1:.N, sep = '_'), by = .(id)]  # define the column names
dcast(result, id ~ item, value.var = 'V1')  # convert from long to wide
## id val_1 val_2 val_3 val_4
## 1: 1 7 12 58 1
## 2: 2 1 2 5 7
## 3: 3 15 12 <NA> <NA>

break long character strings by "-" and identify the unique components by group

Suppose you have a transaction data set with the products purchased by customers. You know the total skus for each order (which unique products they buy per order). But you want to know the unique skus over the lifetime of the user. Say I buy "apple" for the first time ever today; then apple is a NEW sku. But when I buy "apple" and "banana" again, apple is not a new sku but banana is (if it's a first-time purchase).
data
user_id <- c(1, 1, 1, 2, 3, 4, 4)
order_date <- c("2/9/2016",
                "11/19/2015",
                "12/30/2016",
                "9/27/2016",
                "12/10/2016",
                "11/5/2016",
                "1/1/2017")
sku <- c("262-264-280", "280-123", "510", "6251-16990", "9227-14572", "9227-14572", "280")
dt <- data.frame(user_id, order_date, sku)
Update: I originally typed "user_id" as "order_id".
A data.table possibility, using the strings split by -, and checking for new unique values in each row via a set union and Reduce(..., accumulate=TRUE). The count of new values is then the difference between successive rows:
library(data.table)
setDT(dt)
dt[, sku := as.character(sku)]
dt[,
   total := lengths(Reduce(union, strsplit(sku, "-"), accumulate = TRUE)),
   by = user_id
]
dt[, new := c(total[1], diff(total)), by=user_id]
dt
#   user_id order_date         sku total new
#1:       1   2/9/2016 262-264-280     3   3
#2:       1 11/19/2015     280-123     4   1
#3:       1 12/30/2016         510     5   1
#4:       2  9/27/2016  6251-16990     2   2
#5:       3 12/10/2016  9227-14572     2   2
#6:       4  11/5/2016  9227-14572     2   2
#7:       4   1/1/2017         280     3   1
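To see what the accumulated union looks like, here it is run on user 1's skus alone; the lengths 3, 4, 5 are the running totals in the output above:
Reduce(union, strsplit(c("262-264-280", "280-123", "510"), "-"), accumulate = TRUE)
#[[1]]
#[1] "262" "264" "280"
#
#[[2]]
#[1] "262" "264" "280" "123"
#
#[[3]]
#[1] "262" "264" "280" "123" "510"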
A possible tidyverse way (written with user_id, per the update above):
library(dplyr)
library(tidyr)
dt %>%
  separate_rows(sku, sep = "-") %>%
  mutate(order_date = as.Date(as.character(order_date), "%m/%d/%Y")) %>%
  group_by(user_id, sku) %>%
  arrange(user_id, order_date) %>%
  mutate(idx = row_number() * (row_number() == 1)) %>%
  group_by(user_id, order_date) %>%
  summarise(sku = paste(sku, collapse = "-"),
            `number of new sku purchase` = sum(idx)) %>%
  group_by(user_id) %>%
  mutate(`total number of sku` = cumsum(`number of new sku purchase`))
Here's a base solution that is similar to @thelatemail's:
dt$sku <- as.character(dt$sku)
dt$n_skus <- ave(dt$sku,
                 dt$user_id,
                 FUN = function(sku_f) {
                   sapply(
                     Reduce(union, strsplit(sku_f, '-', fixed = TRUE), accumulate = TRUE),
                     length)
                 })
dt$n_skus <- as.integer(dt$n_skus)
dt$n_new <- ave(dt$n_skus,
                dt$user_id,
                FUN = function(n) c(n[1], diff(n)))
dt
dt
  user_id order_date         sku n_skus n_new
1       1   2/9/2016 262-264-280      3     3
2       1 11/19/2015     280-123      4     1
3       1 12/30/2016         510      5     1
4       2  9/27/2016  6251-16990      2     2
5       3 12/10/2016  9227-14572      2     2
6       4  11/5/2016  9227-14572      2     2
7       4   1/1/2017         280      3     1
And here's a crazy, inefficient data.table solution that doesn't include the original sku field:
library(data.table)
setDT(dt)
dt[, strsplit(as.character(sku), '-'), by = .(user_id, order_date)
   ][, .SD[!duplicated(V1), .(n_new = .N), by = order_date], by = .(user_id)
   ][, n_total := cumsum(n_new), by = .(user_id)][]
Performance
@thelatemail's should scale up the best.
Unit: microseconds
           expr       min        lq       mean    median        uq     max neval
 argonaut_dplyr 10020.601 10247.851 10720.0709 10474.451 10770.751 16021.3   100
 thelatemail_dt  1954.501  2072.101  2385.8019  2303.001  2436.202  5807.0   100
      cole_base   651.501   751.601   834.0299   772.201   794.752  6309.2   100
    cole_bad_dt  7006.400  7355.200  7757.9991  7668.401  7958.651 12708.0   100
    purrr_A_sul 14575.501 14862.552 15566.4809 15033.201 15401.601 33263.3   100
(base is always fastest with only 7 rows; it would be outstripped by @thelatemail's very quickly)
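For reference, a sketch of how timings like these are usually produced with microbenchmark; the f_* wrappers here are hypothetical stand-ins, one per answer above:
library(microbenchmark)
library(data.table)
# f_thelatemail(), f_base(), etc. are hypothetical wrapper functions around each answer
microbenchmark(
  thelatemail_dt = f_thelatemail(copy(dt)),  # copy() so := doesn't modify dt in place
  cole_base      = f_base(dt),
  times = 100
)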
Here is one option: unlist(strsplit(...)) the current row's skus, then compare them with the previously seen skus:
library(dplyr)
library(purrr)
library(tidyr)
dt %>%
  nest(-user_id) %>%
  mutate(NNSP = map(data, ~map_dbl(1:length(.x$sku), function(y) {
           ynow <- unlist(strsplit(as.character(.x$sku)[y], '-'))
           yprev <- unique(unlist(strsplit(as.character(.x$sku[1:y-1]), '-')))
           length(ynow[!ynow %in% yprev])
         })),
         TotNNSP = map(NNSP, ~cumsum(.x))) %>%
  unnest()
# A tibble: 7 x 5
  user_id  NNSP TotNNSP order_date sku
    <dbl> <dbl>   <dbl> <fct>      <fct>
1       1     3       3 2/9/2016   262-264-280
2       1     1       4 11/19/2015 280-123
3       1     1       5 12/30/2016 510
4       2     2       2 9/27/2016  6251-16990
5       3     2       2 12/10/2016 9227-14572
6       4     2       2 11/5/2016  9227-14572
7       4     1       3 1/1/2017   280
Using @thelatemail's Reduce and lengths combination, we can do:
library(dplyr)
setdiff2 <- function(x, y) y[!y %in% x]
dt %>%
  group_by(user_id) %>%
  mutate(sku = as.character(sku),
         NNSP = lengths(Reduce(setdiff2, strsplit(sku, '-'), accumulate = TRUE)),
         Tot_NNSP = cumsum(NNSP))
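One caution with this variant (a small illustration on user 1's skus): Reduce passes along only the previous result, not the running union, so each accumulated element contains just the skus that were new in that row. The lengths are exactly what NNSP needs here, but a sku that reappears after skipping a row would be counted as new again:
setdiff2 <- function(x, y) y[!y %in% x]
Reduce(setdiff2, strsplit(c("262-264-280", "280-123", "510"), "-"), accumulate = TRUE)
#[[1]]
#[1] "262" "264" "280"
#
#[[2]]
#[1] "123"
#
#[[3]]
#[1] "510"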

convert data frame of "missed" numbers into data frame of numbers "hit"

I have quite a specific question, but it should be easy to solve, I just cannot think how...
I have a simple data frame like this:
mydf <- data.frame(Shooter = 1:3,
                   Targets.missed = c(paste(sample(1:10, 4), collapse = ";"),
                                      paste(sample(1:10, 5), collapse = ";"),
                                      paste(sample(1:10, 8), collapse = ";")))
mydf
  Shooter   Targets.missed
1       1          3;8;4;7
2       2       10;1;5;7;4
3       3 5;9;4;10;8;1;6;7
This data frame tells me the Targets (from 1 to 10) that are missed by each Shooter.
I would like to obtain a different data frame that tells me, per Target, which Shooter(s) hit it.
The result would be:
Target hit.by.Shooters
     1               1
     2           1;2;3
     3             2;3
     4              NA
     5               1
     6             1;2
     7              NA
     8               2
     9             1;2
    10               1
We expand the data by splitting at the ";" of 'Targets.missed' into 'long' format. Then, grouped by 'Shooter', we summarise with a list of the numbers from 1:10 that are not in 'Targets.missed', unnest the list column, group by 'Target', summarise by pasteing the unique 'Shooter' elements into a single string, and fill in the missing elements of 1:10 with NA using complete.
library(tidyverse)
mydf %>%
  separate_rows(Targets.missed) %>%
  group_by(Shooter) %>%
  summarise(Target = list(setdiff(1:10, Targets.missed))) %>%
  unnest() %>%
  group_by(Target) %>%
  summarise(hit.by.Shooters = paste(unique(Shooter), collapse = ";")) %>%
  complete(Target = 1:10)
# A tibble: 10 x 2
#   Target hit.by.Shooters
#    <int> <chr>
# 1      1 1
# 2      2 1;2;3
# 3      3 2;3
# 4      4 <NA>
# 5      5 1
# 6      6 1;2
# 7      7 <NA>
# 8      8 2
# 9      9 1;2
#10     10 1
Another option is base R: split 'Targets.missed' (assuming character class) into a list of vectors, loop through the list and take the values of 1:10 not in each element (with setdiff), set the names of the list with the 'Shooter' column, stack the key/value pairs into a two-column data.frame, keep the unique rows, aggregate by pasteing the 'ind' column grouped by 'values', and merge with a full 'values' dataset of 1:10:
out <- aggregate(ind ~ values,
                 unique(stack(setNames(lapply(strsplit(mydf$Targets.missed, ';'),
                          setdiff, x = 1:10), mydf$Shooter))),
                 FUN = paste, collapse = ";")
out1 <- merge(data.frame(values = 1:10), out, all.x = TRUE)
and change the column names if necessary
names(out1) <- c('Target', 'hit.by.Shooters')
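To make the chain easier to follow, here is the intermediate stack() result for the first shooter (a sketch on the data below):
lst <- lapply(strsplit(mydf$Targets.missed, ';'), setdiff, x = 1:10)
head(stack(setNames(lst, mydf$Shooter)), 4)
#  values ind
#1      1   1
#2      2   1
#3      5   1
#4      6   1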
data
mydf <- structure(list(Shooter = 1:3, Targets.missed = c("3;8;4;7", "10;1;5;7;4",
"5;9;4;10;8;1;6;7")), class = "data.frame", row.names = c("1",
"2", "3"))
Another tidyverse possibility. We first create a dataframe with all possible combinations of Shooter and Targets, then remove the rows present in mydf using anti_join, fill in the missing Targets by adding them as NA, and finally summarise by Target to get the Shooters who actually hit each target.
library(tidyverse)
crossing(Shooter = unique(mydf$Shooter), Targets.missed = 1:10) %>%
  anti_join(mydf %>% separate_rows(Targets.missed) %>% mutate_all(as.numeric)) %>%
  complete(Targets.missed = 1:10) %>%
  group_by(Targets.missed) %>%
  summarise(hit.by.Shooters = paste0(Shooter, collapse = ";"))
#   Targets.missed hit.by.Shooters
#            <int> <chr>
# 1              1 1;2
# 2              2 1;2
# 3              3 1
# 4              4 1
# 5              5 2
# 6              6 1;3
# 7              7 1;2
# 8              8 2
# 9              9 NA
#10             10 3
data
set.seed(987)
mydf <- data.frame(Shooter = 1:3,
                   Targets.missed = c(paste(sample(1:10, 4), collapse = ";"),
                                      paste(sample(1:10, 5), collapse = ";"),
                                      paste(sample(1:10, 8), collapse = ";")))
data.table approach
library(data.table)
# vector with all possible targets
targets.v <- 1:10
# split the missed targets into a list
missed.list <- strsplit(mydf$Targets.missed, ";")
# invert, to get all hit targets
hit.list <- lapply(missed.list, function(x) as.data.table(targets.v[!targets.v %in% x]))
# bind the hit targets into a data.table
dt <- rbindlist(hit.list, idcol = "shooter")
# summarise (paste with collapse), and join on all possible targets
dt[, .(hit.by.shooters = paste(shooter, collapse = ";")), by = .(target = V1)
   ][data.table(target = targets.v), on = c("target")]
# target hit.by.shooters
# 1: 1 1
# 2: 2 1;2;3
# 3: 3 2;3
# 4: 4 <NA>
# 5: 5 1
# 6: 6 1;2
# 7: 7 <NA>
# 8: 8 2
# 9: 9 1;2
# 10: 10 1

Finding mean of every x observations for list of dataframes

I'm trying to follow this SO post: Calculate the mean of every 13 rows in data frame, but for some reason it's not working correctly on my end. Their example works fine:
df <- data.frame(a = 1:12, b = 13:24)
df
n <- 5
aggregate(df, list(rep(1:(nrow(df) %/% n + 1), each = n, len = nrow(df))), mean)[-1]
     a    b
1  3.0 15.0
2  8.0 20.0
3 11.5 23.5
But mine, which uses a for loop over a list of data frames, doesn't:
for (dset in 1:5) {
  if (dset == 1) { n <- 60 } else { n <- 12 }  # combine by 12 otherwise
  print(n)
  v.ntrade <- aggregate(B.list[[dset]][, 7],
                        list(rep(1:(nrow(B.list[[dset]][, 7]) %/% n + 1),
                                 each = n, len = nrow(B.list[[dset]][, 7]))), sum)
  v.volume <- aggregate(B.list[[dset]][, 5],
                        list(rep(1:(nrow(B.list[[dset]][, 5]) %/% n + 1),
                                 each = n, len = nrow(B.list[[dset]][, 5]))), sum)
  B.list[[dset]] <- aggregate(B.list[[dset]],
                              list(rep(1:(nrow(B.list[[dset]]) %/% n + 1),
                                       each = n, len = nrow(B.list[[dset]]))), mean)
  # replace vol and ntrades
  B.list[[dset]][, 7] <- v.ntrade[, 2]
  B.list[[dset]][, 5] <- v.volume[, 2]
  B.list[[dset]] <- B.list[[dset]][, -1]
}
Before:
> B.list[[1]][,4]
PAIRclose
1: 8063.21
2: 8065.95
3: 8053.50
4: 8040.00
5: 8054.00
---
75009: 7471.40
75010: 7461.99
75011: 7472.56
75012: 7482.05
75013: 7469.69
After:
> B.list[[1]][,4]
[1] 5698.0203 2257.8796 2886.9289 1812.9951 1521.3267 2305.9228 1103.6083
etc
Is there some weird behavior with the aggregate function? Or is it something with the %/%n+1, which I have no idea what it does?
We can do this with the tidyverse. Loop through the list of datasets with map, create a grouping variable with gl, and use summarise_all to get the mean of all the other columns:
library(tidyverse)
lst %>%
  map(~ .x %>%
        group_by(grp = as.integer(gl(n(), n, n()))) %>%
        summarise_all(mean))
#[[1]]
# A tibble: 3 x 3
#    grp     a     b
#  <int> <dbl> <dbl>
#1     1   3    15
#2     2   8    20
#3     3  11.5  23.5
#
#[[2]]
# A tibble: 3 x 3
#    grp     a     b
#  <int> <dbl> <dbl>
#1     1   3    15
#2     2   8    20
#3     3  11.5  23.5
Or using base R with lapply and aggregate
lapply(lst, function(x) aggregate(. ~ cbind(grp = as.integer(gl(nrow(x), n, nrow(x)))),
                                  x, mean)[-1])
data
lst <- list(df, df)
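Both the rep(...) call from the question and gl(n(), n, n()) in the answer build the same grouping vector; with n = 5 and 12 rows it is easy to inspect directly:
n <- 5
rep(1:(12 %/% n + 1), each = n, len = 12)
# [1] 1 1 1 1 1 2 2 2 2 2 3 3
as.integer(gl(12, n, 12))  # gl() produces the same grouping
# [1] 1 1 1 1 1 2 2 2 2 2 3 3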

Finding IDs based on one unique Output Value in R

I have two columns in a dataframe, advertisementID and Payout. Many advertisementIDs have more than one Payout value, but I need to find those advertisementIDs which have only one unique Payout value. How can I do this in R?
Example:
advertisementID Payout
              1     10
              2      3
              1     10
              2      4
              3      5
              3      4
So the output should be like this:
advertisementID Payout
              1     10
since advertisementID 1 has a single unique Payout value, which is 10.
Using base R:
new <- aggregate(Payout ~ advertisementID, dt, unique)
new[lengths(new$Payout)==1, ]
output:
  advertisementID Payout
1               1     10
Or in a cleaner way with magrittr:
library(magrittr)
aggregate(Payout ~ advertisementID, dt, unique) %>% subset(lengths(Payout)==1)
A solution using dplyr:
library(dplyr)
dt2 <- dt %>%
  group_by(advertisementID) %>%
  filter(n_distinct(Payout) == 1) %>%
  distinct(advertisementID, Payout) %>%
  ungroup()
dt2
# A tibble: 1 x 2
  advertisementID Payout
            <int>  <int>
1               1     10
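For completeness, a data.table sketch of the same logic (assuming dt as defined under DATA below):
library(data.table)
# drop exact duplicate rows, then keep groups with a single unique Payout
unique(setDT(dt))[, if (uniqueN(Payout) == 1) .SD, by = advertisementID]
#   advertisementID Payout
#1:               1     10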
DATA
dt <- read.table(text = "advertisementID Payout
1 10
2 3
1 10
2 4
3 5
3 4",
header = TRUE)
