Sum rows with specific criteria in r - r

I have the following data:
function person
1 hr 1
2 sls 5
3 mktg 3
4 qlt 7
5 rev 5
I want to merge the rows whose "function" value is "sls" or "mktg" into a single row holding the sum of their "person" values, using R.
The desired output is:
Person function
1 1 hr
2 8 sls & mktg
3 7 qlt
4 5 rev

A base R solution:
# Labels that should be collapsed into a single combined group.
merged_groups <- c("sls", "mktg")
# Relabel every matching row with the combined name ("sls & mktg").
is_merged <- dat$func %in% merged_groups
dat$func[is_merged] <- paste(merged_groups, collapse = " & ")
# Sum `person` within each (possibly relabelled) value of `func`.
aggregate(person ~ func, data = dat, FUN = sum)
func person
1 hr 1
2 qlt 7
3 rev 5
4 sls & mktg 8
Data
dat <- data.frame(
func = c("hr", "sls", "mktg", "qlt", "rev"),
person = c(1, 5, 3, 7, 5),
stringsAsFactors = FALSE
)
Note that this assumes dat$func is a character vector; if it is not (for example, if it is a factor), first convert it with as.character().

library(dplyr)
dat <- data.frame(func = c("hr", "sls", "mktg", "qlt", "rev"),
person = c(1, 5, 3, 7, 5))
# Relabel "sls" and "mktg" as one combined group, then sum `person` per group.
# as.character() alone handles both character and factor input; the extra
# as.factor() round trip was redundant.
dat %>%
  mutate(
    func = as.character(func),
    func = ifelse(func %in% c("sls", "mktg"), "sls & mktg", func)
  ) %>%
  group_by(func) %>%
  summarize(Person = sum(person))
returns
# A tibble: 4 x 2
func Person
<chr> <dbl>
1 hr 1
2 qlt 7
3 rev 5
4 sls & mktg 8

Another approach with dplyr:
Code:
# Collapse the "sls" and "mktg" groups into one "sls & mktg" group, then sum
# `person` within each resulting group.
# The pattern is anchored (^...$) so that only values that are exactly "sls"
# or "mktg" are relabelled; an unanchored "sls|mktg" would also rewrite part
# of any longer value that merely contains one of those substrings.
dfr %>%
  group_by(Function = sub("^(sls|mktg)$", "sls & mktg", functn)) %>%
  summarise(Person = sum(person))
Output:
# A tibble: 4 x 2
Function Person
<chr> <dbl>
1 hr 1.
2 qlt 7.
3 rev 5.
4 sls & mktg 8.
Data
stringsAsFactors = TRUE|FALSE - works in both cases.
dfr <- data.frame(
functn = c("hr", "sls", "mktg", "qlt", "rev"),
person = c(1, 5, 3, 7, 5)
)

Related

R group_by and filter with multiple conditions

I have a dataset like the following:
ID <- c(1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 5)
LABEL <- c('TT', 'CH', 'AB', 'TT', 'CH', 'AB', 'TT', 'CH', 'TT', 'CH', 'AB')
VALUE <- c(2121, 32, 1, 2121, 32, 2, 2121, 99, 2222, 32, 9)
DATA <- data.frame(ID, LABEL, VALUE)
I'm trying to group the data by ID (it's a long file where ID replicates) then filter where the same ID meets multiple conditions. I've tried the following:
X <- DATA %>%
group_by(ID) %>%
filter(
(LABEL == "TT" & VALUE == "2121") &
(LABEL == "CH" & VALUE == "32") )
I want to get a dataset with only the observations with ID 1 and 2 which meet both conditions simultaneously.
Any help is appreciated.
We can use any() after grouping by 'ID'. A single row can never satisfy both conditions at once, so each condition is checked with any(), which collapses the rows of a group into a single TRUE/FALSE. The two results are then combined with &, so a group is kept only if both combinations are present in it.
library(dplyr)
# Keep all rows of every ID that contains BOTH a (TT, 2121) row and a
# (CH, 32) row.
# Each condition gets its own any() so it is reduced to one TRUE/FALSE per
# group before the two are combined; VALUE is numeric, so it is compared with
# numbers rather than strings.
DATA %>%
  group_by(ID) %>%
  filter(
    any(LABEL == "TT" & VALUE == 2121) &&
      any(LABEL == "CH" & VALUE == 32)
  ) %>%
  ungroup()
-output
# A tibble: 6 × 3
ID LABEL VALUE
<dbl> <chr> <dbl>
1 1 TT 2121
2 1 CH 32
3 1 AB 1
4 2 TT 2121
5 2 CH 32
6 2 AB 2
Or a slightly more compact option: paste the two columns together and then check whether all the elements on the lhs of %in% are found (i.e. all are TRUE) to filter the groups
library(stringr)
DATA %>%
group_by(ID) %>%
filter(all(c("TT2121", "CH32") %in% str_c(LABEL, VALUE))) %>%
ungroup
-output
# A tibble: 6 × 3
ID LABEL VALUE
<dbl> <chr> <dbl>
1 1 TT 2121
2 1 CH 32
3 1 AB 1
4 2 TT 2121
5 2 CH 32
6 2 AB 2

Add a column with single value per group

I have a grouped tibble with several columns. I now want to add a new column that has the same value for every row within a group but a different value for each group, basically giving the groups names. These per-group values are supplied from a vector.
Ideally I want to do this in a generic way, so that it works in a function based on the number of groups the input has.
Any help would be much appreciated. Here is a very basic and reduced example of the tibble and vector (the original tibble has character, int, and dbl columns):
# Example input: a tibble grouped by `a`, plus one label per group value.
df <- tibble(a = c(1, 2, 3, 1, 3, 2)) %>% group_by(a)
names <- c("owl", "newt", "zag")
# Expected result: the same rows with the matching label in `name`.
# (The assignment operator must be the ASCII `<-`; the original used an
# en dash, which R cannot parse.)
desired_output <- tibble(
  a = c(1, 2, 3, 1, 3, 2),
  name = c("owl", "newt", "zag", "owl", "zag", "newt")
)
as the output i would like to have the same tibble just with another column for all in group 1 = owl, 2 = newt, and 3 = zag
Just take a as indices:
library(dplyr)
df %>%
mutate(name = names[a])
# # A tibble: 6 × 2
# a name
# <dbl> <chr>
# 1 1 owl
# 2 2 newt
# 3 3 zag
# 4 1 owl
# 5 3 zag
# 6 2 newt
You can also use recode() if a cannot be used as indices.
df %>%
mutate(name = recode(a, !!!setNames(names, 1:3)))
Data
df <- tibble(a = c(1,2,3,1,3,2))
names <- c("owl", "newt", "zag")
Something like this?
library(dplyr)
names = c("owl", "newt", "zag")
df %>%
group_by(a) %>%
mutate(new_col = case_when(a == 1 ~ names[1],
a == 2 ~ names[2],
a == 3 ~ names[3]))
a new_col
<dbl> <chr>
1 1 owl
2 2 newt
3 3 zag
4 1 owl
5 2 newt
6 3 zag
7 2 newt
8 3 zag
9 1 owl
10 2 newt
11 1 owl
12 3 zag
13 2 newt
14 3 zag
data:
df <- structure(list(a = c(1, 2, 3, 1, 2, 3, 2, 3, 1, 2, 1, 3, 2, 3
)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-14L))
You could use factor() with mutate():
names = c("owl", "newt", "zag")
dat = data.frame(a = c(1,2,3,1,2,3,2,3,1,2,1,3,2,3))
dat %>% mutate(label = factor(a, levels = c(1,2,3), labels = names))
Just make sure the order in levels corresponds to the order in labels (i.e 1 = "owl")

Automating a loop in igraph using tidygraph

Hello and hope all goes well.
I made an edit to my previous question and hope it makes it more clear.
I created an igraph object and would like to run same analysis several times and extract some information in each iteration.
I can't share the whole data, so I am sharing just a small subset.
df_edge is as follows:
library(dplyr)
job_1 <-c(1,2,6,6,5,6,7,8,6,8,8,6,6,8)
job_2 <- c(2,4,5,8,3,1,4,6,1,7,3,2,4,5)
weight <- c(1,1,1,2,1,1,2,1,1,1,2,1,1,1)
df_edge <- tibble(job_1,job_2,weight)
df_edge %>% glimpse()
Rows: 14
Columns: 3
$ job_1 <dbl> 1, 2, 6, 6, 5, 6, 7, 8, 6, 8, 8, 6, 6, 8
$ job_2 <dbl> 2, 4, 5, 8, 3, 1, 4, 6, 1, 7, 3, 2, 4, 5
$ weight <dbl> 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 1, 1, 1
df_node is as follows:
job_id <- c(1,2,3,4,5,6,7,8)
job_type <- c(1,2,0,0,3,1,1,1)
df_node <- tibble(job_id,job_type)
df_node %>% glimpse()
Rows: 8
Columns: 2
$ job_id <dbl> 1, 2, 3, 4, 5, 6, 7, 8
$ job_type <dbl> 1, 2, 0, 0, 3, 1, 1, 1
Creating the igraph object:
library(igraph)
library(tidygraph)
tp_network_subset <- graph.data.frame(df_edge,vertices = df_node,directed = F)
summary of job_type column in the df_node
df_node %>%
count(job_type)
A tibble: 4 x 2
job_type n
<dbl> <int>
1 0 2
2 1 4
3 2 1
4 3 1
What I am doing manually is the following:
### finding a job_id that belongs to job_type==1 category
df_node %>% filter(job_type==1) %>%
select(job_id)
A tibble: 4 x 1
job_id
<dbl>
1 1
2 6
3 7
4 8
# for instance, I picked one of them and it is job_id = 6
### using the job_id to create a subgraph by selecting order 1 neighbors of this job_id (6)
node_test <- make_ego_graph(tp_network_subset,order = 1 ,nodes="6")
### creating a dataframe of this subgrapgh where there is no isolated nodes
df_test <- as_tbl_graph(node_test[[1]]) %>%
activate(nodes) %>%
filter(!node_is_isolated()) %>%
as_tibble()
df_test %>% glimpse()
Rows: 6
Columns: 2
$ name <chr> "1", "2", "4", "5", "6", "8"
$ job_type <dbl> 1, 2, 0, 3, 1, 1
## subgraph size is 6 which will be an outcome of interest
### if the graph is zero length , I should stop here and pick another job_id that belongs to job_type==1 category
In this example, the graph is not zero length, so I proceed to the next step.
### calculating the measure of interest in respect to job_type==1 category
df_test %>%
summarise(job_rate= (nrow(df_test %>% filter(job_type==1)))/(nrow(df_test %>%
filter(job_type %in% c(1,2,3)))))
# 0.6
If job_rate > 0.5, I want to keep the job_rate and the rows (corresponding nodes) of the job_type==0 category of the subgraph. In this instance, job_rate was 0.6, so I am keeping the following:
df_final <- as_tbl_graph(node_test[[1]]) %>%
activate(nodes) %>%
filter(!node_is_isolated()) %>%
as_tibble() %>% filter(job_type==0)
# A tibble: 1 x 2
name job_type
<chr> <dbl>
1 4 0
But, I need to assign their corresponding job__rate and some other related columns. So, my favorite outcome would be
name job_type subgraph_origin_id job_rate subgraph_size no_(job_type==0)_in_subgrapgh no_(job_type==1)_in_subgrapgh no_(job_type==2)_in_subgrapgh no_(job_type==3)_in_subgrapgh
<chr> <dbl>
1 4 0 6 0.6 6
So, I need to repeat this process and create subgraphs for all job_type==1 nodes. If the graph is not zero length and its job_rate > 0.5, then extract all the corresponding nodes in that subgraph along with the job_rate and the other columns shown in the desired outcome.
Does this work for you?
# For each job_type group, build the order-1 ego graph of every node in the
# group. A subgraph contributes rows when it contains at least one
# job_type == 0 node and at least one node of type 1-3, and the origin's own
# job_type accounts for more than half of the type 1-3 nodes. The rows kept
# are the subgraph's job_type == 0 nodes, annotated with the origin id, the
# rate, the subgraph size and the per-type node counts.
#
# Split on the column itself rather than on a free-standing `job_type`
# vector that happens to exist in the global environment.
dflst <- split(df_node, df_node$job_type)
tpe <- as.numeric(names(dflst))
# Collect per-subgraph results in a list and bind once at the end instead of
# growing a data frame with rbind() on every iteration.
results <- list()
for (i in seq_along(dflst)) {
  df <- dflst[[i]]
  origin_id <- df$job_id
  jtpe <- tpe[i]
  # Pass ids as character so igraph looks vertices up by name; numeric values
  # are treated as vertex positions, which only coincide with the job ids
  # when the vertices happen to be stored in id order 1..n.
  node_test_lst <- make_ego_graph(
    tp_network_subset,
    order = 1,
    nodes = as.character(origin_id)
  )
  for (j in seq_along(node_test_lst)) {
    node_test <- node_test_lst[[j]]
    df_test <- as_tbl_graph(node_test) %>%
      activate(nodes) %>%
      filter(!node_is_isolated()) %>%
      as_tibble()
    has_type0 <- nrow(filter(df_test, job_type == 0)) > 0
    has_typed <- any(df_test$job_type %in% 1:3)
    # Scalar condition, so use the short-circuiting `&&`.
    if (has_type0 && has_typed) {
      job_rate <- with(df_test, sum(job_type == jtpe) / sum(job_type %in% 1:3))
      if (job_rate > 0.5) {
        df_final <- df_test %>%
          filter(job_type == 0) %>%
          mutate(
            subgraph_origin_id = origin_id[j],
            job_rate = job_rate,
            subgraph_size = nrow(df_test)
          ) %>%
          cbind(
            setNames(
              as.list(table(factor(df_test$job_type, levels = 0:3))),
              sprintf("no_(job_type==%s)_in_subgrapgh", 0:3)
            )
          )
        results[[length(results) + 1]] <- df_final
      }
    }
  }
}
# Same fallback as before when nothing qualifies: an empty tibble.
out <- if (length(results) > 0) do.call(rbind, results) else tibble()
which gives
> out
name job_type subgraph_origin_id job_rate subgraph_size
1 4 0 6 0.60 6
2 4 0 7 1.00 3
3 3 0 8 0.75 5
no_(job_type==0)_in_subgrapgh no_(job_type==1)_in_subgrapgh
1 1 3
2 1 2
3 1 3
no_(job_type==2)_in_subgrapgh no_(job_type==3)_in_subgrapgh
1 1 1
2 0 0
3 0 1

Get most frequent value(s) from a list of lists

I am trying to convert an LDA prediction result, which is a list object containing hundreds of lists (each holding the numeric topics assigned to each token in a document), such as the following example:
assignments <- list(
as.integer(c(1, 1, 1, 1, 1, 1, 2, 2, 2, 3, 3)),
as.integer(c(1, 1, 1, 2, 2, 2, 2, 2, 3, 3)),
as.integer(c(1, 3, 3, 3, 3, 3, 3, 2, 2))
)
where each list of the list object has different length corresponding to the length of each tokenized document.
What I want to do is 1) get the most frequent topic (1, 2, 3) out of each list, and 2) convert the result into tbl or data.frame format like this:
document topic freq
1 1 6
2 2 5
3 3 6
such that I can use inner_join() to merge this "consensus" prediction with the topic assignment results generated by tm or topicmodels applications and compare their precision, etc. Since the assignments object is in list format, I cannot apply the top_n() function to get the most frequent topic for each list. I tried using lapply(unlist(assignments), count), but it didn't give me what I want.
You can iterate over the list with sapply, get frequency with table and extract first value from sorted result:
result <- sapply(assignments, function(x) sort(table(x), decreasing = TRUE)[1])
data.frame(document = seq_along(assignments),
topic = as.integer(names(result)),
freq = result)
document topic freq
1 1 1 6
2 2 2 5
3 3 3 6
We can loop through the list, get the frequency of elements with tabulate, find the index of maximum elements, extract those along with the frequency as a data.frame and rbind the list elements
do.call(rbind, lapply(seq_along(assignments), function(i) {
x <- assignments[[i]]
ux <- unique(x)
i1 <- tabulate(match(x, ux))
data.frame(document = i, topic = ux[which.max(i1)], freq = max(i1))})
)
# document topic freq
#1 1 1 6
#2 2 2 5
#3 3 3 6
Or another option is to convert it to a two column dataset and then do group by to find the index of max values
library(data.table)
setDT(stack(setNames(assignments, seq_along(assignments))))[,
.(freq = .N), .(document = ind, topic = values)][, .SD[freq == max(freq)], document]
# document topic freq
#1: 1 1 6
#2: 2 2 5
#3: 3 3 6
Or we can use tidyverse
library(tidyverse)
map(assignments, as_tibble) %>%
bind_rows(.id = 'document') %>%
count(document, value) %>%
group_by(document) %>%
filter(n == max(n)) %>%
ungroup %>%
rename_at(2:3, ~c('topic', 'freq'))
# A tibble: 3 x 3
# document topic freq
# <chr> <int> <int>
#1 1 1 6
#2 2 2 5
#3 3 3 6
using purrr::imap_dfr :
library(tidyverse)
imap_dfr(assignments,~ tibble(
document = .y,
Topic = names(which.max(table(.x))),
freq = max(tabulate(.x))))
# # A tibble: 3 x 3
# document Topic freq
# <int> <chr> <int>
# 1 1 1 6
# 2 2 2 5
# 3 3 3 6

R - Find a sequence of row elements based on time constraints in a dataframe

Consider the following dataframe (ordered by id and time):
df <- data.frame(id = c(rep(1,7),rep(2,5)), event = c("a","b","b","b","a","b","a","a","a","b","a","a"), time = c(1,3,6,12,24,30,32,1,2,6,17,24))
df
id event time
1 1 a 1
2 1 b 3
3 1 b 6
4 1 b 12
5 1 a 24
6 1 b 30
7 1 a 42
8 2 a 1
9 2 a 2
10 2 b 6
11 2 a 17
12 2 a 24
I want to count how many times a given sequence of events appears in each "id" group. Consider the following sequence with time constraints:
seq <- c("a", "b", "a")
time_LB <- c(0, 2, 12)
time_UB <- c(Inf, 8, 18)
It means that event "a" can start at any time, event "b" must start no earlier than 2 and no later than 8 after event "a", another event "a" must start no earlier than 12 and no later than 18 after event "b".
Some rules for creating sequences:
Events don't need to be consecutive with respect to "time" column. For example, seq can be constructed from rows 1, 3, and 5.
To be counted, sequences must have different first event. For example, if seq = rows 8, 10, and 11 was counted, then seq = rows 8, 10, and 12 must not be counted.
The events may be included in many constructed sequences if they do not violate the second rule. For example, we count both sequences: rows 1, 3, 5 and rows 5, 6, 7.
The expected result:
df1
id count
1 1 2
2 2 2
There are some related questions in R - Identify a sequence of row elements by groups in a dataframe and Finding rows in R dataframe where a column value follows a sequence.
Is there a way to solve the problem using "dplyr"?
I believe this is what you're looking for. It gives you the desired output. Note that there is a typo in your original question where you have a 32 instead of a 42 when you define the time column in df. I say this is a typo because it doesn't match your output immediately below the definition of df. I changed the 32 to a 42 in the code below.
library(dplyr)
df <- data.frame(id = c(rep(1,7),rep(2,5)), event = c("a","b","b","b","a","b","a","a","a","b","a","a"), time = c(1,3,6,12,24,30,42,1,2,6,17,24))
seq <- c("a", "b", "a")
time_LB <- c(0, 2, 12)
time_UB <- c(Inf, 8, 18)
df %>%
full_join(df,by='id',suffix=c('1','2')) %>%
full_join(df,by='id') %>%
rename(event3 = event, time3 = time) %>%
filter(event1 == seq[1] & event2 == seq[2] & event3 == seq[3]) %>%
filter(time1 %>% between(time_LB[1],time_UB[1])) %>%
filter((time2-time1) %>% between(time_LB[2],time_UB[2])) %>%
filter((time3-time2) %>% between(time_LB[3],time_UB[3])) %>%
group_by(id,time1) %>%
slice(1) %>% # slice 1 row for each unique id and time1 (so no duplicate time1s)
group_by(id) %>%
count()
Here's the output:
# A tibble: 2 x 2
id n
<dbl> <int>
1 1 2
2 2 2
Also, if you omit the last 2 parts of the dplyr pipe that do the counting (to see the sequences it is matching), you get the following sequences:
Source: local data frame [4 x 7]
Groups: id, time1 [4]
id event1 time1 event2 time2 event3 time3
<dbl> <fctr> <dbl> <fctr> <dbl> <fctr> <dbl>
1 1 a 1 b 6 a 24
2 1 a 24 b 30 a 42
3 2 a 1 b 6 a 24
4 2 a 2 b 6 a 24
EDIT IN RESPONSE TO COMMENT REGARDING GENERALIZING THIS: Yes it is possible to generalize this to arbitrary length sequences but requires some R voodoo. Most notably, note the use of Reduce, which allows you to apply a common function on a list of objects as well as foreach, which I'm borrowing from the foreach package to do some arbitrary looping. Here's the code:
library(dplyr)
library(foreach)
df <- data.frame(id = c(rep(1,7),rep(2,5)), event = c("a","b","b","b","a","b","a","a","a","b","a","a"), time = c(1,3,6,12,24,30,42,1,2,6,17,24))
seq <- c("a", "b", "a")
time_LB <- c(0, 2, 12)
time_UB <- c(Inf, 8, 18)
# Helper: full join of two data frames on `id`.
multi_full_join <- function(df1, df2) {
  full_join(df1, df2, by = "id")
}
# One copy of df per element of the target sequence, all joined on id so that
# each row of df2 is one candidate combination of events.
df_list <- foreach(i = seq_along(seq)) %do% {df}
df2 <- Reduce(multi_full_join, df_list)
# Rename the joined event/time columns to event1..k / time1..k.
names(df2)[grep('event', names(df2))] <- paste0('event', seq_along(seq))
names(df2)[grep('time', names(df2))] <- paste0('time', seq_along(seq))
df2 <- df2 %>% mutate_if(is.factor, as.character)
# Keep only the combinations whose concatenated events spell the sequence.
df2 <- df2 %>%
  mutate(seq_string = Reduce(paste0, df2 %>% select(grep('event', names(df2))) %>% as.list)) %>%
  filter(seq_string == paste0(seq, collapse = ''))
# Per-row differences between consecutive time columns. Each row yields
# length(seq) - 1 differences, so the matrix width is derived from the
# sequence length instead of being hard-coded to 2.
# NOTE(review): only the 3-element sequence is shown running in this file;
# confirm the remaining steps for other lengths before relying on them.
time_diff <- df2 %>% select(grep('time', names(df2))) %>%
  t %>%
  as.data.frame() %>%
  lapply(diff) %>%
  unlist %>% matrix(ncol = length(seq) - 1, byrow = TRUE) %>%
  as.data.frame
# Check every difference against its lower/upper bound, AND the checks
# together row-wise, and keep the rows of df2 that pass all of them.
foreach(i = seq_along(time_diff), .combine = data.frame) %do%
  {
    time_diff[[i]] %>% between(time_LB[i + 1], time_UB[i + 1])
  } %>%
  Reduce(`&`, .) %>%
  which %>%
  slice(df2, .) %>%
  filter(time1 %>% between(time_LB[1], time_UB[1])) %>% # deal with time1 bounds, which we skipped over earlier
  group_by(id, time1) %>%
  slice(1) # slice 1 row for each unique id and time1 (so no duplicate time1s)
This outputs the following:
Source: local data frame [4 x 8]
Groups: id, time1 [4]
id event1 time1 event2 time2 event3 time3 seq_string
<dbl> <chr> <dbl> <chr> <dbl> <chr> <dbl> <chr>
1 1 a 1 b 6 a 24 aba
2 1 a 24 b 30 a 42 aba
3 2 a 1 b 6 a 24 aba
4 2 a 2 b 6 a 24 aba
If you want just the counts, you can group_by(id) then count() as in the original code snippet.
Perhaps it's easier to represent event sequences as strings and use regex:
# Encode each id's events as a timeline string: character t holds the event
# recorded at time t, and "-" marks times without an event.
# assumes `time` holds positive whole numbers, since it is used as an index
# -- TODO confirm for other data
df.str <- lapply(split(df, df$id), function(d) {
  # Size the timeline by the largest time rather than by the last row, so the
  # result does not depend on the rows being sorted by time.
  z <- rep("-", max(d$time))
  z[d$time] <- as.character(d$event)
  paste(z, collapse = "")
})
# > df.str
# $`1`
# [1] "a-b--b-----b-----------a-----b-----------a"
#
# $`2`
# [1] "aa---b----------a------a"

# Count the start positions from which the pattern matches: an "a", a "b"
# 2-8 steps later, then an "a" 12-18 steps after that "b". The lookahead
# makes matches zero-width, so each starting "a" is counted at most once.
# gregexpr() returns -1 when nothing matches, so count positive positions
# instead of taking length(), which would report 1 for "no match".
count_sequences <- function(s) {
  hits <- gregexpr("(?=a.{1,7}b.{11,17}a)", s, perl = TRUE)[[1]]
  sum(hits > 0)
}
df1 <- lapply(df.str, count_sequences)
data.frame(id = names(df1), count = unlist(df1))
# id count
# 1 1 2
# 2 2 2

Resources