I wish to create a new column in the dataframe below that is contingent on certain strings - in this case, "next section".
library(tidyverse)
set.seed(123)

# Each article: some random fruits, the "next"/"section" marker pair, then
# more random fruits.
df1 <- tibble(
  text = c(
    sample(fruit, sample(1:3)),
    "next", "section",
    sample(fruit, sample(1:3))
  ),
  article = "df1"
)
df2 <- tibble(
  text = c(
    sample(fruit, sample(1:3)),
    "next", "section",
    sample(fruit, sample(1:3))
  ),
  article = "df2"
)
df3 <- tibble(
  text = c(
    sample(fruit, sample(1:3)),
    "next", "section",
    sample(fruit, sample(1:3))
  ),
  article = "df3"
)

# Stack the three articles into a single data frame.
final_df <- bind_rows(df1, df2, df3)
To be clear, this is the output I'd like to achieve:
# Desired result, written out by hand: within each article the rows up to and
# including "section" are "first", the remaining rows are "second".
final_df %>%
  mutate(
    label = rep(
      rep(c("first", "second"), times = 3),
      times = c(5, 2, 4, 1, 4, 2)
    )
  )
# A tibble: 18 x 3
text article label
<chr> <chr> <chr>
1 cantaloupe df1 first
2 quince df1 first
3 kiwi fruit df1 first
4 next df1 first
5 section df1 first
6 cantaloupe df1 second
7 date df1 second
8 rambutan df2 first
9 passionfruit df2 first
10 next df2 first
11 section df2 first
12 rock melon df2 second
13 blood orange df3 first
14 guava df3 first
15 next df3 first
16 section df3 first
17 strawberry df3 second
18 cherimoya df3 second
I'm thinking I could start with a group_by(article), followed with mutate(label = case_when()) but I'm stuck beyond this. Specifically, how do I populate the rows before and including the strings "next" and "section"?
We can use lag to get text from the previous row and use cumsum to increment the count whenever we observe 'section' in current row and 'next' in previous row for each article.
library(dplyr)
# Number the sections within each article. The counter starts at 1 and goes up
# on the row *after* every "next", "section" pair, so the marker rows stay in
# the section they close.
final_df %>%
  group_by(article) %>%
  mutate(
    # default = "" keeps the first row's comparison FALSE instead of NA, so an
    # article that starts with "section" cannot turn the whole cumsum() into NA.
    label = lag(
      cumsum(text == "section" & lag(text, default = "") == "next"),
      default = 0
    ) + 1
  ) %>%
  ungroup()
# text article label
# <chr> <chr> <dbl>
# 1 cantaloupe df1 1
# 2 quince df1 1
# 3 kiwi fruit df1 1
# 4 next df1 1
# 5 section df1 1
# 6 cantaloupe df1 2
# 7 date df1 2
# 8 rambutan df2 1
# 9 passionfruit df2 1
#10 next df2 1
#11 section df2 1
#12 rock melon df2 2
#13 blood orange df3 1
#14 guava df3 1
#15 next df3 1
#16 section df3 1
#17 strawberry df3 2
#18 cherimoya df3 2
The same logic can be translated to data.table using shift.
library(data.table)
# Same logic as the dplyr version, with shift() in place of lag().
# fill = "" on the inner shift() keeps the first row's comparison FALSE rather
# than NA, so an article that starts with "section" cannot make cumsum() NA.
setDT(final_df)[,
  label := shift(
    cumsum(text == "section" & shift(text, fill = "") == "next"),
    fill = 0
  ) + 1,
  by = article
]
You can replace 1, 2 with 'first', 'second' if you need output in that form.
Related
I have a tidy data.frame in this format:
library(tidyverse)
# Toy data: one row per (person, fruit) pair.
df <- data.frame(
  name = rep(c("Clarence", "Shelby", "Patricia"), times = c(3, 2, 2)),
  fruit = c(
    "Apple", "Banana", "Grapes",
    "Apple", "Apricot",
    "Banana", "Grapes"
  )
)
df
# name fruit
#1 Clarence Apple
#2 Clarence Banana
#3 Clarence Grapes
#4 Shelby Apple
#5 Shelby Apricot
#6 Patricia Banana
#7 Patricia Grapes
I want to compare the overlaps between groups in a pairwise manner (i.e. if both people have an apple that counts as an overlap of 1) so that I end up with a dataframe that looks like this:
# Desired result: one row per pair of people with the number of shared fruits.
df2 <- data.frame(
  names = c("Clarence-Shelby", "Clarence-Patricia", "Shelby-Patricia"),
  n_overlap = c(1, 2, 0)
)
df2
# names n_overlap
#1 Clarence-Shelby 1
#2 Clarence-Patricia 2
#3 Shelby-Patricia 0
Is there an elegant way to do this in the tidyverse framework? My real dataset is much larger than this and will be grouped on multiple columns.
If the 0 overlap is not important, a solution is:
# Self-join on fruit, keep each unordered pair once (name.x < name.y), and count the shared fruits.
> df %>% inner_join(df,by="fruit") %>% filter(name.x<name.y) %>% count(name.x,name.y)
name.x name.y n
1 Clarence Patricia 2
2 Clarence Shelby 1
If you really need non-overlapping pairs:
> # Pairs that share at least one fruit; name.x < name.y keeps each unordered pair once.
> a <- df %>% inner_join(df, by = "fruit") %>% filter(name.x < name.y) %>% count(name.x, name.y)
> # Every possible pair of names. sort() makes the pair order agree with the
> # name.x < name.y filter above, so the join keys line up.
> b <- as.data.frame(t(combn(sort(unique(df$name)), 2)))
> colnames(b) <- colnames(a)[1:2]
> # Pairs missing from `a` come back with n = NA; replace that with 0.
> a %>% full_join(b) %>% replace_na(list(n = 0))
Joining, by = c("name.x", "name.y")
name.x name.y n
1 Clarence Patricia 2
2 Clarence Shelby 1
3 Patricia Shelby 0
Try this,
# Build one "A-B" key per pair of distinct names; the two names inside each key
# are sorted so the key does not depend on the order in which they appear.
combinations <- apply(combn(unique(df$name), 2), 2, function(z) paste(sort(z), collapse = "-"))
combinations
# [1] "Clarence-Shelby" "Clarence-Patricia" "Patricia-Shelby"
library(dplyr)
# For each fruit, collapse the sorted names that have it into a key, match those
# keys against the pair keys, and count the matched fruits per pair.
df %>%
group_by(fruit) %>%
# NOTE(review): a fruit held by three or more names yields a key such as
# "A-B-C", which equals none of the pair keys - confirm that every fruit is
# shared by at most two names before relying on this.
summarize(names = paste(sort(unique(name)), collapse = "-")) %>%
# right_join keeps every pair key; pairs without a matching fruit get fruit = NA.
right_join(tibble(names = combinations), by = "names") %>%
group_by(names) %>%
# Rows with a non-missing fruit are the overlaps; unmatched pairs sum to 0.
summarize(n_overlap = sum(!is.na(fruit)))
# # A tibble: 3 x 2
# names n_overlap
# <chr> <int>
# 1 Clarence-Patricia 2
# 2 Clarence-Shelby 1
# 3 Patricia-Shelby 0
I have this dataframe separate_on_condition with two columns:
# One-row example: `first` holds label+count specs, `second` the values to spread.
separate_on_condition <- data.frame(first = "a3,b1,c2", second = "1,2,3,4,5,6")
# first second
# 1 a3,b1,c2 1,2,3,4,5,6
How can I turn it to:
# A tibble: 6 x 2
first second
<chr> <chr>
1 a 1
2 a 2
3 a 3
4 b 4
5 c 5
6 c 6
where:
a3 will be separated into 3 rows
b1 into 1 row
c2 into 2 rows
Is there a better way on achieving this instead of using rep() on first column and separate_rows() on the second column?
Any help would be much appreciated!
Create a row number column to account for multiple rows.
Split second column on , in separate rows.
For each row extract the data to be repeated along with number of times it needs to be repeated.
library(dplyr)
library(tidyr)
library(stringr)
# Spread `second` over one row per value, then rebuild `first` by repeating each
# label as many times as the number that follows it in the original spec.
separate_on_condition %>%
  mutate(row = row_number()) %>%
  separate_rows(second, sep = ",") %>%
  group_by(row) %>%
  mutate(
    # Inside a group every row carries the same spec, so its first element is
    # enough: the letters are the labels, the digits the repeat counts.
    first = rep(
      str_extract_all(first(first), "[a-zA-Z]+")[[1]],
      times = str_extract_all(first(first), "\\d+")[[1]]
    )
  ) %>%
  ungroup() %>%
  select(-row)
# first second
# <chr> <chr>
#1 a 1
#2 a 2
#3 a 3
#4 b 4
#5 c 5
#6 c 6
You can use the following base R option:
# Base R version. Each comma-separated spec in `first` is a label followed by a
# repeat count; `second` is a comma-separated list of numbers.
# The numbers are parsed directly instead of eval()-ing a code string built from
# the data, and rep() is applied to the whole vector instead of via sapply().
with(
  separate_on_condition,
  {
    specs <- unlist(strsplit(first, ",", fixed = TRUE))
    data.frame(
      # Strip the digits to get the label, strip the non-digits to get the count.
      first = rep(gsub("\\d", "", specs), as.numeric(gsub("\\D", "", specs))),
      second = as.numeric(unlist(strsplit(second, ",", fixed = TRUE)))
    )
  }
)
which gives
first second
1 a 1
2 a 2
3 a 3
4 b 4
5 c 5
6 c 6
Here is an alternative approach:
add NA to first to get same length
use separate_rows to bring each element to a row
use extract by regex digit to split first into first and helper
group and slice by values in helper
do some tweaking
library(tidyr)
library(dplyr)
separate_on_condition %>%
  # Pad `first` with three NA entries so it has as many elements as `second`
  # (the padding length is specific to this example: 3 specs vs. 6 values).
  # paste0() is used because this snippet loads only tidyr and dplyr.
  mutate(first = paste0(first, ",NA,NA,NA")) %>%
  separate_rows(first, second, sep = "[^[:alnum:].]+", convert = TRUE) %>%
  # The pattern captures exactly one character for the label and one for the
  # count, so it fits single-letter labels with single-digit counts.
  extract(first, into = c("first", "helper"), "(.{1})(.{1})", remove = FALSE) %>%
  group_by(second) %>%
  # Repeat each row `helper` times.
  slice(rep(1:n(), each = helper)) %>%
  ungroup() %>%
  # Drop the padding rows, then renumber `second` from 1.
  drop_na() %>%
  mutate(second = row_number()) %>%
  select(first, second)
first second
<chr> <int>
1 a 1
2 a 2
3 a 3
4 b 4
5 c 5
6 c 6
This is a simplified version of a problem involving a large list containing complex tables. I want to extract the tables from the list and apply a function to each one. Here we can create a simple list containing small named data frames:
library(tidyverse)
# Three small tables collected in a list whose element names come from
# table_names.
table_names <- c("dfA", "dfB", "dfC")

dfA <- tibble(a = 1:3, b = 4:6, c = 7:9)
dfB <- tibble(a = 10:12, b = 13:15, c = 16:18)
dfC <- tibble(a = 19:21, b = 22:24, c = 25:27)

df_list <- setNames(list(dfA, dfB, dfC), table_names)
Here is a simplified example of the kind of operation I would like to apply:
# Take the dfA table out of the list, add a constant `name` column, and move
# that column to the front.
dfA_mod <- df_list$dfA %>%
mutate(name = 'dfA') %>%
select(name, everything())
In this example, I extract one of three tables in the list df_list$dfA, create a new column with the same value in each row mutate(name = 'dfA'), and re-order the columns so that the new column appears in the left-most position select(name, everything()). The resulting object is assigned to dfA_mod.
To solve the larger problem, I want to use one of the purrr::map() variants to apply the function over the character vector table_names, which was initiated in the first block of code above. The elements of table_names serve two purposes: 1) naming the tables held in the list; and 2) supplying values for the name column in the modified table.
I could write a function such as:
# Intended to build the modified table for the list element named by `x` and to
# store it in the global environment as "<x>_mod".
# x: character name of a table held in df_list.
# NOTE(review): `df_list$x` looks up an element literally called "x", not the
# element named by the value of `x`; `df_list[[x]]` is the lookup that uses the
# value.
fun <- function(x) {
df_list$x %>%
mutate(name = x) %>%
select(name, everything()) %>%
# `.` puts the piped table in assign()'s value position.
assign(paste0(x, '_mod'), ., envir = .GlobalEnv)
}
And then use map() to create a new list of modified tables:
# NOTE(review): `table_name` is not defined in the code shown (the vector above
# is `table_names`), and `fun(x)` is a call rather than the function `fun`.
new_list <- df_list %>% map(table_name, fun(x))
But of course this code does not work, with the main obstacle being (for me at least) figuring out how to quote and unquote the right terms within the function. I'm a beginner at tidy evaluation, and I could use some help in specifying the function and using map properly.
Here is the desired output (for one modified table):
# A tibble: 3 x 4
name a b c
<chr> <int> <int> <int>
1 dfA 1 4 7
2 dfA 2 5 8
3 dfA 3 6 9
Thanks in advance for any help!
We can use purrr::imap which passes data in the list as well as name of the list
library(dplyr)
library(purrr)
# imap() hands each table together with its list name to the function, so the
# name can be stored in a leading `name` column.
df_out <- imap(df_list, function(tbl, tbl_name) {
  tbl %>%
    mutate(name = tbl_name) %>%
    select(name, everything())
})
df_out
#$dfA
# A tibble: 3 x 4
# name a b c
# <chr> <int> <int> <int>
#1 dfA 1 4 7
#2 dfA 2 5 8
#3 dfA 3 6 9
#$dfB
# A tibble: 3 x 4
# name a b c
# <chr> <int> <int> <int>
#1 dfB 10 13 16
#....
#....
This gives a list of desired dataframes, if you want them as separate dataframes, you can do
# Suffix every element name with "_mod", then copy the elements into the global
# environment as individual objects.
df_out <- setNames(df_out, paste0(names(df_out), "_mod"))
list2env(df_out, envir = .GlobalEnv)
We can also do it using base R Map
# Base R equivalent: walk over the tables and their names in parallel, add the
# name as a column, and reorder so that `name` comes first.
df_out <- Map(
  function(tbl, tbl_name) {
    with_name <- transform(tbl, name = tbl_name)
    with_name[c("name", names(tbl))]
  },
  df_list,
  names(df_list)
)
and give list names same as above.
We can convert it to a single data.frame with map while passing the .id
library(purrr)
# Row-bind all tables in the list into one data frame; .id = 'name' requests a
# `name` column identifying the list element each row came from.
map_dfr(df_list, I, .id = 'name')
Or with bind_rows
library(dplyr)
# Stack the list's tables; the `name` column holds each table's list name, as
# the printed result below shows.
bind_rows(df_list, .id = 'name')
# A tibble: 9 x 4
# name a b c
# <chr> <int> <int> <int>
#1 dfA 1 4 7
#2 dfA 2 5 8
#3 dfA 3 6 9
#4 dfB 10 13 16
#5 dfB 11 14 17
#6 dfB 12 15 18
#7 dfC 19 22 25
#8 dfC 20 23 26
#9 dfC 21 24 27
I have two data sets with one common variable - ID (there are duplicate ID numbers in both data sets). I need to link dates to one data set, but I can't use left-join because the first or left file so to say needs to stay as it is (I don't want it to return all combinations and add rows). But I also don't want it to link data like vlookup in Excel which finds the first match and returns it so when I have duplicate ID numbers it only returns the first match. I need it to return the first match, then the second, then third (because the dates are sorted so that the newest date is always first for every ID number) and so on BUT I can't have added rows. Is there any way to do this? Since I don't know how else to show you I have included an example picture of what I need. data joining. Not sure if I made myself clear but thank you in advance!
You can add a second column to create subid's that follow the order of the rownumbers. Then you can use an inner_join to join everything together.
Since you don't have example data sets I created two to show the principle.
# Number the rows within each ID in both tables (1st, 2nd, ... occurrence).
# Joining on ID plus that counter pairs the n-th row of df1 with the n-th row
# of df2 for the same ID, so the join cannot multiply rows.
# The counter is called sub_id, matching the column shown in the result below.
df1 <- df1 %>%
  group_by(ID) %>%
  mutate(sub_id = row_number())
df2 <- df2 %>%
  group_by(ID) %>%
  mutate(sub_id = row_number())
# Spell out the join keys instead of relying on the guessed common columns.
outcome <- df1 %>% inner_join(df2, by = c("ID", "sub_id"))
# A tibble: 7 x 3
# Groups: ID [?]
ID sub_id var1
<dbl> <int> <fct>
1 1 1 a
2 1 2 b
3 2 1 e
4 3 1 f
5 4 1 h
6 4 2 i
7 4 3 j
data:
# Example data: df1 lists the IDs to keep (with repeats); df2 has at least as
# many rows per ID and carries the value to attach.
df1 <- data.frame(ID = rep(c(1, 2, 3, 4), times = c(2, 1, 1, 3)))
df2 <- data.frame(
  ID = rep(c(1, 2, 3, 4), times = c(4, 1, 2, 4)),
  var1 = letters[seq_len(11)]
)
You need a secondary id column. Since you need the first n matches, just group by the id, create an autoincrement id for each group, then join as usual
# Example data: df1 holds the ids to keep; df2 holds a random start date d1 and
# a later date d2 (1 to 50 days after d1) for each of its 11 rows.
df1 <- data.frame(id = c(1, 1, 2, 3, 4, 4, 4))
d1 <- sample(
  seq(as.Date("1999/01/01"), as.Date("2012/01/01"), by = "day"),
  11
)
df2 <- data.frame(
  id = c(1, 1, 1, 1, 2, 3, 3, 4, 4, 4, 4),
  d1,
  d2 = d1 + sample.int(50, 11)
)
library(dplyr)
# Give every row a per-id sequence number in both tables; joining on id plus
# that number attaches the n-th df2 row of an id to the n-th df1 row of the
# same id, so df1 keeps exactly its original rows.
# row_number() is used instead of 1:n(), which would yield c(1, 0) for an
# empty group.
df11 <- df1 %>%
  group_by(id) %>%
  mutate(id2 = row_number()) %>%
  ungroup()
df21 <- df2 %>%
  group_by(id) %>%
  mutate(id2 = row_number()) %>%
  ungroup()
left_join(df11, df21, by = c("id", "id2"))
# A tibble: 7 x 4
id id2 d1 d2
<dbl> <int> <date> <date>
1 1 1 2009-06-10 2009-06-13
2 1 2 2004-05-28 2004-07-11
3 2 1 2001-08-13 2001-09-06
4 3 1 2005-12-30 2006-01-19
5 4 1 2000-08-06 2000-08-17
6 4 2 2010-09-02 2010-09-10
7 4 3 2007-07-27 2007-09-05
This question may sound similar to others, but I hope it is different enough.
I want to take a specific list of values and count how often they appear in another list of values, where non-occurring values are returned as '0'.
I have a Data Frame (df1) with the following values:
# The items whose occurrences should be counted, as a one-column data frame.
Items <- c("Carrots", "Plums", "Pineapple", "Turkey")
df1 <- data.frame(Items = Items)
>df1
Items
1 Carrots
2 Plums
3 Pineapple
4 Turkey
And a second Data Frame (df2) that contains a column called 'Thing':
> head(df2,n=10)
ID Date Thing
1 58150 2012-09-12 Potatoes
2 12357 2012-09-28 Turnips
3 50788 2012-10-04 Oranges
4 66038 2012-10-11 Potatoes
5 18119 2012-10-11 Oranges
6 48349 2012-10-14 Carrots
7 23328 2012-10-16 Peppers
8 66038 2012-10-26 Pineapple
9 32717 2012-10-28 Turnips
10 11345 2012-11-08 Oranges
I know the word 'Turkey' only appears in df1 NOT in df2. I want to return a frequency table or count of the items in df1 that appears in df2 and return '0' for the count of Turkey.
How can I summarize the values of one data frame column using the values from another? The closest I got was:
# Count the rows per Thing, then keep only the Things that are listed in df1$Items.
df2%>% count (Thing) %>% filter(Thing %in% df1$Items,)
But this returns only the items that appear in both df1 and df2, so 'Turkey' gets excluded. So close!
> df2%>% count (Thing) %>% filter(Thing %in% df1$Items,)
# A tibble: 3 x 2
Thing n
<fctr> <int>
1 Carrots 30
2 Pineapple 30
3 Plums 38
I want my output to look like this:
1 Carrots 30
2 Pineapple 30
3 Plums 38
4 Turkey 0
I am newish to R and completely new to dplyr.
I use this sort of thing all the time. I'm sure there's a more savvy way to code it, but it's what I got:
# Count, for each item in df1, how many rows of df2$Thing equal it.
item <- vector()
count <- vector()
# NOTE(review): list(unique(...)) wraps the whole vector in a one-element list,
# so length(items) is 1 and the loop body runs once.
items <- list(unique(df1$Items))
for (i in 1:length(items)){
item[i] <- items[i]
# NOTE(review): this compares against all of `item`, not `item[i]`.
count[i] <- sum(df2$Thing == item)
}
# Combine the two vectors column-wise and convert the result to a data frame.
df3 <- data.frame(cbind(item, count))
Hope this helps!
Stephen's solution worked with a slight modification, adding the [i] to the item at the end of count[i] line. See below:
# Count how many rows of df2$Thing equal each distinct item in Items; items that
# never occur get a count of 0.
# The loop runs over the distinct items themselves (not the first few elements
# of Items), the result vectors are preallocated, and the data frame is built
# from the vectors directly so that `count` stays numeric.
distinct_items <- unique(as.character(Items))
item <- character(length(distinct_items))
count <- integer(length(distinct_items))
for (i in seq_along(distinct_items)) {
  item[i] <- distinct_items[i]
  # na.rm = TRUE: a missing Thing is not a match and must not turn the count into NA.
  count[i] <- sum(df2$Thing == item[i], na.rm = TRUE)
}
df3 <- data.frame(item, count)
> df3
item count
1 Carrots 30
2 Plums 38
3 Pineapple 30
4 Turkey 0
dplyr drops 0 count rows, and you have the added complication that the possible categories of Thing are different between your two datasets.
If you add the factor levels from df1 to df2, you can use complete from tidyr, which is a common way to add 0 count rows.
I'm adding the factor levels from df1 to df2 using a convenience function from package forcats called fct_expand.
library(dplyr)
library(tidyr)
library(forcats)
# Count every Thing, including df1's items that never occur in df2.
df2 %>%
  # Add df1's items as factor levels so that complete() can create rows for
  # them. The column is spelled out as `Items`; `df1$Item` only resolves through
  # `$` partial matching on a plain data frame.
  mutate(Thing = fct_expand(Thing, as.character(df1$Items))) %>%
  count(Thing) %>%
  # Add the levels that have no rows, with a count of 0.
  complete(Thing, fill = list(n = 0L)) %>%
  # Keep only the items asked for in df1.
  filter(Thing %in% df1$Items)
A different approach is to aggregate df2 first, to right join with df1 (to pick all rows of df1), and to replace NA by zero.
library(dplyr)
# Count the rows per Thing first, then right-join onto the distinct rows of df1
# so that every item of df1 is kept; items without a count get 0 instead of NA.
df2 %>%
count(Thing) %>%
right_join(unique(df1), by = c("Thing" = "Items")) %>%
mutate(n = coalesce(n, 0L))
# A tibble: 4 x 2
Thing n
<chr> <int>
1 Carrots 1
2 Plums 0
3 Pineapple 1
4 Turkey 0
Warning message:
Column `Thing`/`Items` joining factors with different levels, coercing to character vector
The same approach in data.table:
library(data.table)
# Count rows per Thing, join onto the distinct rows of df1 (Thing matched to
# Items), then set the missing counts to 0.
setDT(df2)[, .N, by = Thing][unique(setDT(df1)), on = .(Thing = Items)][is.na(N), N := 0L][]
Thing N
1: Carrots 1
2: Plums 0
3: Pineapple 1
4: Turkey 0
Note that in both implementations unique(df1) is used to avoid unintended duplicate rows after the join.
Edit 2019-06-22:
With development version 1.12.3 data.table has gained a coalesce() function. So, above statement can be written
# Same count-then-join chain, with coalesce() supplying 0 where N is missing.
setDT(df2)[, .N, by = Thing][unique(setDT(df1)), on = .(Thing = Items)][, N := coalesce(N, 0L)][]
If df2 is large and df1 contains only a few Items it might be more efficient to join first and then to aggregate:
library(dplyr)
# Join first, aggregate second: keep every item of df1, then count per Thing the
# joined rows that actually came from df2 (those have a non-missing ID).
df2 %>%
right_join(unique(df1), by = c("Thing" = "Items")) %>%
group_by(Thing) %>%
summarise(n = sum(!is.na(ID)))
# A tibble: 4 x 2
Thing n
<chr> <int>
1 Carrots 1
2 Pineapple 1
3 Plums 0
4 Turkey 0
Warning message:
Column `Thing`/`Items` joining factors with different levels, coercing to character vector
The same in data.table syntax:
library(data.table)
# Join df2 onto the distinct rows of df1 (Thing matched to Items), then count
# per Thing the rows with a non-missing ID.
setDT(df2)[unique(setDT(df1)), on = .(Thing = Items)][, .(N = sum(!is.na(ID))), by = Thing][]
Thing N
1: Carrots 1
2: Plums 0
3: Pineapple 1
4: Turkey 0
Edit 2019-06-22: Above can be written more concisely by aggregating in a join:
# Join and count in a single step, grouping by .EACHI.
# NOTE(review): unlike the variants above, this joins df1 directly rather than
# unique(df1) - confirm df1 has no duplicate Items.
setDT(df2)[setDT(df1), on = .(Thing = Items), .N, by = .EACHI]