dplyr: filter a value by existing in two conditions [duplicate] - r

I have a R dataset x as below:
ID Month
1 1 Jan
2 3 Jan
3 4 Jan
4 6 Jan
5 6 Jan
6 9 Jan
7 2 Feb
8 4 Feb
9 6 Feb
10 8 Feb
11 9 Feb
12 10 Feb
13 1 Mar
14 3 Mar
15 4 Mar
16 6 Mar
17 7 Mar
18 9 Mar
19 2 Apr
20 4 Apr
21 6 Apr
22 7 Apr
23 8 Apr
24 10 Apr
25 1 May
26 2 May
27 4 May
28 6 May
29 7 May
30 8 May
31 2 Jun
32 4 Jun
33 5 Jun
34 6 Jun
35 9 Jun
36 10 Jun
I am trying to figure out an R function/code to identify all IDs that exist at least once in every month.
In the above case, ID 4 & 6 are present in all months.
Thanks

First, split the df$ID by Month and use intersect to find elements common in each sub-group.
Reduce(intersect, split(df$ID, df$Month))
#[1] 4 6
If you want to subset the corresponding data.frame, do
df[df$ID %in% Reduce(intersect, split(df$ID, df$Month)),]

We can use data.table. Convert the 'data.frame' to 'data.table' (setDT(df1)), grouped by 'ID', get the row index (.I) where the number of unique 'Months' are equal to the number of unique 'Months' in the whole dataset and subset the data based on this
library(data.table)
setDT(df1)[df1[, .I[uniqueN(Month) == uniqueN(df1$Month)], ID]$V1]
# ID Month
# 1: 4 Jan
# 2: 4 Feb
# 3: 4 Mar
# 4: 4 Apr
# 5: 4 May
# 6: 4 Jun
# 7: 6 Jan
# 8: 6 Jan
# 9: 6 Feb
#10: 6 Mar
#11: 6 Apr
#12: 6 May
#13: 6 Jun
To extract the 'ID's
setDT(df1)[, ID[uniqueN(Month) == uniqueN(df1$Month)], ID]$V1
#[1] 4 6
Or with base R
1) Using table with rowSums
# Number of distinct months in which each ID occurs (rows of the table are IDs).
v1 <- rowSums(table(df1) > 0)
# Keep only IDs seen in every month. Comparing with max(v1) would instead
# return the best-covered IDs even when no ID is present in all months.
names(v1)[v1 == length(unique(df1$Month))]
#[1] "4" "6"
This info can be used for subsetting the data
# Compare against the total number of months (not max(v1)) so that no rows are
# returned when no ID occurs in every month.
subset(df1, ID %in% names(v1)[v1 == length(unique(df1$Month))])
2) Using tapply
lst <- with(df1, tapply(Month, ID, FUN = unique))
names(which(lengths(lst) == length(unique(df1$Month))))
#[1] "4" "6"
Or using dplyr
library(dplyr)
df1 %>%
group_by(ID) %>%
filter(n_distinct(Month)== n_distinct(df1$Month)) %>%
.$ID %>%
unique
#[1] 4 6
or if we need to get the rows
df1 %>%
group_by(ID) %>%
filter(n_distinct(Month)== n_distinct(df1$Month))
# A tibble: 13 x 2
# Groups: ID [2]
# ID Month
# <int> <chr>
# 1 4 Jan
# 2 6 Jan
# 3 6 Jan
# 4 4 Feb
# 5 6 Feb
# 6 4 Mar
# 7 6 Mar
# 8 4 Apr
# 9 6 Apr
#10 4 May
#11 6 May
#12 4 Jun
#13 6 Jun

An alternative solution using dplyr and purrr:
# Split the IDs by month (dropping the grouping column), then intersect the
# per-month tibbles. FALSE is spelled out because F can be reassigned, and
# intersect is namespaced like the other calls so that the data-frame method
# from dplyr is used even when dplyr is not attached.
tib %>%
  dplyr::group_by(Month) %>%
  dplyr::group_split(.keep = FALSE) %>%
  purrr::reduce(dplyr::intersect)
# A tibble: 2 x 1
# ID
# <dbl>
# 1 4
# 2 6
returns the desired IDs, where tib is a tibble containing the input data.

Related

Extracting strings from links using regex in R

I have a list of URL links and I want to extract parts of each string and save them in other variables. The sample data is below:
sample<- c("http://dps.endavadigital.net/owgr/doc/content/archive/2009/owgr01f2009.pdf",
"http://dps.endavadigital.net/owgr/doc/content/archive/2009/owgr02f2001.pdf",
"http://dps.endavadigital.net/owgr/doc/content/archive/2009/owgr03f2002.pdf",
"http://dps.endavadigital.net/owgr/doc/content/archive/2009/owgr04f2004.pdf",
"http://dps.endavadigital.net/owgr/doc/content/archive/2009/owgr05f2005.pdf",
"http://dps.endavadigital.net/owgr/doc/content/archive/2009/owgr06f2018.pdf",
"http://dps.endavadigital.net/owgr/doc/content/archive/2009/owgr07f2016.pdf",
"http://dps.endavadigital.net/owgr/doc/content/archive/2009/owgr08f2015.pdf",
"http://dps.endavadigital.net/owgr/doc/content/archive/2009/owgr09f2020.pdf",
"http://dps.endavadigital.net/owgr/doc/content/archive/2009/owgr10f2014.pdf")
sample
[1] "http://dps.endavadigital.net/owgr/doc/content/archive/2009/owgr01f2009.pdf"
[2] "http://dps.endavadigital.net/owgr/doc/content/archive/2009/owgr02f2001.pdf"
[3] "http://dps.endavadigital.net/owgr/doc/content/archive/2009/owgr03f2002.pdf"
[4] "http://dps.endavadigital.net/owgr/doc/content/archive/2009/owgr04f2004.pdf"
[5] "http://dps.endavadigital.net/owgr/doc/content/archive/2009/owgr05f2005.pdf"
[6] "http://dps.endavadigital.net/owgr/doc/content/archive/2009/owgr06f2018.pdf"
[7] "http://dps.endavadigital.net/owgr/doc/content/archive/2009/owgr07f2016.pdf"
[8] "http://dps.endavadigital.net/owgr/doc/content/archive/2009/owgr08f2015.pdf"
[9] "http://dps.endavadigital.net/owgr/doc/content/archive/2009/owgr09f2020.pdf"
[10] "http://dps.endavadigital.net/owgr/doc/content/archive/2009/owgr10f2014.pdf"
I want to extract week and year using regex.
week year
1 1 2009
2 2 2001
3 3 2002
4 4 2004
5 5 2005
6 6 2018
7 7 2016
8 8 2015
9 9 2020
10 10 2014
You could use str_match to capture numbers after 'owgr' and 'f' :
library(stringr)
str_match(sample, 'owgr(\\d+)f(\\d+)')[, -1]
You can convert this to dataframe, change class to numeric and assign column names.
# The first capture group (digits after 'owgr') is the week and the second
# (digits after 'f') is the year, so the names are assigned in that order.
# as.is = TRUE is passed explicitly, as type.convert() on a data.frame expects.
setNames(type.convert(data.frame(
  str_match(sample, 'owgr(\\d+)f(\\d+)')[, -1]), as.is = TRUE), c('week', 'year'))
#   week year
#1     1 2009
#2     2 2001
#3     3 2002
#4     4 2004
#5     5 2005
#6     6 2018
#7     7 2016
#8     8 2015
#9     9 2020
#10   10 2014
Another way could be to extract all the numbers from last part of sample. We can get the last part with basename.
str_extract_all(basename(sample), '\\d+', simplify = TRUE)
Another way you can try
library(dplyr)
library(stringr)
df <- data.frame(sample)
# The one- or two-digit number between "wgr" and "f" is the week; the four
# digits between "f" and ".pdf" are the year. as.integer() drops the leading
# zero so the result matches the requested numeric output.
df2 <- df %>%
  transmute(
    week = as.integer(str_extract(sample, "(?<=wgr)\\d{1,2}(?=f)")),
    year = as.integer(str_extract(sample, "(?<=f)\\d{4}(?=\\.pdf)"))
  )
#    week year
# 1     1 2009
# 2     2 2001
# 3     3 2002
# 4     4 2004
# 5     5 2005
# 6     6 2018
# 7     7 2016
# 8     8 2015
# 9     9 2020
# 10   10 2014
You could use {unglue} :
library(unglue)
unglue_data(
sample,
"http://dps.endavadigital.net/owgr/doc/content/archive/2009/owgr{week}f{year}.pdf")
#> week year
#> 1 01 2009
#> 2 02 2001
#> 3 03 2002
#> 4 04 2004
#> 5 05 2005
#> 6 06 2018
#> 7 07 2016
#> 8 08 2015
#> 9 09 2020
#> 10 10 2014

Extract elements common in all column groups

I have a R dataset x as below:
ID Month
1 1 Jan
2 3 Jan
3 4 Jan
4 6 Jan
5 6 Jan
6 9 Jan
7 2 Feb
8 4 Feb
9 6 Feb
10 8 Feb
11 9 Feb
12 10 Feb
13 1 Mar
14 3 Mar
15 4 Mar
16 6 Mar
17 7 Mar
18 9 Mar
19 2 Apr
20 4 Apr
21 6 Apr
22 7 Apr
23 8 Apr
24 10 Apr
25 1 May
26 2 May
27 4 May
28 6 May
29 7 May
30 8 May
31 2 Jun
32 4 Jun
33 5 Jun
34 6 Jun
35 9 Jun
36 10 Jun
I am trying to figure out an R function/code to identify all IDs that exist at least once in every month.
In the above case, ID 4 & 6 are present in all months.
Thanks
First, split the df$ID by Month and use intersect to find elements common in each sub-group.
Reduce(intersect, split(df$ID, df$Month))
#[1] 4 6
If you want to subset the corresponding data.frame, do
df[df$ID %in% Reduce(intersect, split(df$ID, df$Month)),]
We can use data.table. Convert the 'data.frame' to 'data.table' (setDT(df1)), grouped by 'ID', get the row index (.I) where the number of unique 'Months' are equal to the number of unique 'Months' in the whole dataset and subset the data based on this
library(data.table)
setDT(df1)[df1[, .I[uniqueN(Month) == uniqueN(df1$Month)], ID]$V1]
# ID Month
# 1: 4 Jan
# 2: 4 Feb
# 3: 4 Mar
# 4: 4 Apr
# 5: 4 May
# 6: 4 Jun
# 7: 6 Jan
# 8: 6 Jan
# 9: 6 Feb
#10: 6 Mar
#11: 6 Apr
#12: 6 May
#13: 6 Jun
To extract the 'ID's
setDT(df1)[, ID[uniqueN(Month) == uniqueN(df1$Month)], ID]$V1
#[1] 4 6
Or with base R
1) Using table with rowSums
# Number of distinct months in which each ID occurs (rows of the table are IDs).
v1 <- rowSums(table(df1) > 0)
# Keep only IDs seen in every month. Comparing with max(v1) would instead
# return the best-covered IDs even when no ID is present in all months.
names(v1)[v1 == length(unique(df1$Month))]
#[1] "4" "6"
This info can be used for subsetting the data
# Compare against the total number of months (not max(v1)) so that no rows are
# returned when no ID occurs in every month.
subset(df1, ID %in% names(v1)[v1 == length(unique(df1$Month))])
2) Using tapply
lst <- with(df1, tapply(Month, ID, FUN = unique))
names(which(lengths(lst) == length(unique(df1$Month))))
#[1] "4" "6"
Or using dplyr
library(dplyr)
df1 %>%
group_by(ID) %>%
filter(n_distinct(Month)== n_distinct(df1$Month)) %>%
.$ID %>%
unique
#[1] 4 6
or if we need to get the rows
df1 %>%
group_by(ID) %>%
filter(n_distinct(Month)== n_distinct(df1$Month))
# A tibble: 13 x 2
# Groups: ID [2]
# ID Month
# <int> <chr>
# 1 4 Jan
# 2 6 Jan
# 3 6 Jan
# 4 4 Feb
# 5 6 Feb
# 6 4 Mar
# 7 6 Mar
# 8 4 Apr
# 9 6 Apr
#10 4 May
#11 6 May
#12 4 Jun
#13 6 Jun
An alternative solution using dplyr and purrr:
# Split the IDs by month (dropping the grouping column), then intersect the
# per-month tibbles. FALSE is spelled out because F can be reassigned, and
# intersect is namespaced like the other calls so that the data-frame method
# from dplyr is used even when dplyr is not attached.
tib %>%
  dplyr::group_by(Month) %>%
  dplyr::group_split(.keep = FALSE) %>%
  purrr::reduce(dplyr::intersect)
# A tibble: 2 x 1
# ID
# <dbl>
# 1 4
# 2 6
returns the desired IDs, where tib is a tibble containing the input data.

how to replace missing values with previous year's binned mean

I have a data frame as below
p1_bin and f1_bin are calculated by cut function by me with
# Bins(): assign each value of x to one of 200 numbered intervals via cut().
# The breaks are 0, 1, 6, 11, ..., 996, so the intervals are (0,1], (1,6],
# (6,11], ... and are labelled 1 to 200; values outside (0, 996] become NA.
Bins <- function(x) cut(x, breaks = c(0, seq(1, 1000, by = 5)), labels = 1:200)
# Apply Bins() to every column of df except the first.
binned <- as.data.frame (sapply(df[,-1], Bins))
# Prefix the binned column names with "Bin_".
colnames(binned) <- paste("Bin", colnames(binned), sep = "_")
# Append the binned columns to the original data frame.
df<- cbind(df, binned)
Now, how can I calculate the mean of the previous two years' values within the same bin and use it to replace the NA values?
For example: in row 5, p1 is NA and f1 is 30, with corresponding bin 7. The NA should be replaced with the mean of p1 from the previous two years for the same bin (7), here (20 + 24) / 2 = 22, i.e.
df
ID year p1 f1 Bin_p1 Bin_f1
1 2013 20 30 5 7
2 2013 24 29 5 7
3 2014 10 16 2 3
4 2014 11 17 2 3
5 2015 NA 30 NA 7
6 2016 10 NA 2 NA
df1
ID year p1 f1 Bin_p1 Bin_f1
1 2013 20 30 5 7
2 2013 24 29 5 7
3 2014 10 16 2 3
4 2014 11 17 2 3
5 2015 **22** 30 NA 7
6 2016 10 **16.5** 2 NA
Thanks in advance
I believe the following code produces the desired output. There's probably a much more elegant way than using mean(rev(lag(f1))[1:2]) to get the average of the last two values of f1 but this should do the trick anyway.
library(dplyr)
# Order rows by year, convert p1 and f1 to double, then fill each NA in f1
# (within groups of Bin_p1) and each NA in p1 (within groups of Bin_f1) using
# mean(rev(lag(.))[1:2]): lag() shifts the group's values down by one,
# rev() reverses them, and [1:2] takes the first two of the reversed vector.
df %>%
arrange(year) %>%
mutate_at(c("p1", "f1"), "as.double") %>%
group_by(Bin_p1) %>%
mutate(f1 = ifelse(is.na(f1), mean(rev(lag(f1))[1:2]), f1)) %>%
group_by(Bin_f1) %>%
mutate(p1 = ifelse(is.na(p1), mean(rev(lag(p1))[1:2]), p1)) %>%
ungroup
and the output is:
# A tibble: 6 x 6
ID year p1 f1 Bin_p1 Bin_f1
<int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 1 2013 20 30.0 5 7
2 2 2013 24 29.0 5 7
3 3 2014 10 16.0 2 3
4 4 2014 11 17.0 2 3
5 5 2015 22 30.0 NA 7
6 6 2016 10 16.5 2 NA

Merge 2 resulting vectors into 1 data frame using R

I have a df like this
Month <- c('JAN','JAN','JAN','JAN','FEB','FEB','MAR','APR','MAY','MAY')
Category <- c('A','A','B','C','A','E','B','D','E','F')
Year <- c(2014,2015,2015,2015,2014,2013,2015,2014,2015,2013)
Number_Combinations <- c(3,2,3,4,1,3,6,5,1,1)
df <- data.frame(Month ,Category,Year,Number_Combinations)
df
Month Category Year Number_Combinations
1 JAN A 2014 3
2 JAN A 2015 2
3 JAN B 2015 3
4 JAN C 2015 4
5 FEB A 2014 1
6 FEB E 2013 3
7 MAR B 2015 6
8 APR D 2014 5
9 MAY E 2015 1
10 MAY F 2013 1
I have another df that I got from the above dataframe with a condition
df1 <- subset(df,Number_Combinations > 2)
df1
Month Category Year Number_Combinations
1 JAN A 2014 3
3 JAN B 2015 3
4 JAN C 2015 4
6 FEB E 2013 3
7 MAR B 2015 6
8 APR D 2014 5
Now I want to create a table reporting the month, the total number of rows for the month in df and the total number of rows for the month in df1
Desired Output would be
Month Number_Month_df Number_Month_df1
1 JAN 4 3
2 FEB 2 1
3 MAR 1 1
4 APR 1 1
5 MAY 2 0
I used table(df) and table(df1) and tried merging them, but I am not getting the desired result. Could someone please help me in getting the above dataframe?
We get the table of the 'Month' column from both 'df' and 'df1', convert to 'data.frame' (as.data.frame), merge by the 'Var1', and change the column names accordingly.
# Approach 1: merge the two frequency tables. all.x = TRUE keeps months that
# do not occur in df1 (e.g. MAY); their missing counts are then set to 0.
res <- merge(as.data.frame(table(df$Month)),
             as.data.frame(table(df1$Month)),
             by = 'Var1', all.x = TRUE)
colnames(res) <- c('Month', 'Number_Month_df', 'Number_Month_df1')
res$Number_Month_df1[is.na(res$Number_Month_df1)] <- 0L

# Approach 2: count both data frames against the same set of month levels, so
# the two count columns always line up row by row and months missing from df1
# are reported as 0. Months appear in their order of first occurrence in df.
month_levels <- as.character(unique(df$Month))
res <- data.frame(
  Month = month_levels,
  Number_Month_df = as.vector(table(factor(df$Month, levels = month_levels))),
  Number_Month_df1 = as.vector(table(factor(df1$Month, levels = month_levels)))
)

data standardization for all group data.frame in R

I have a dataset as below
Date <- rep(c("Jan", "Feb"), 3)[1:5]
Group <- c(rep(letters[1:2],each=2),"c")
value <- sample(1:10,5)
data <- data.frame(Date, Group, value)
> data
Date Group value
1 Jan a 2
2 Feb a 7
3 Jan b 3
4 Feb b 9
5 Jan c 1
As you can see, group c does not have data for Date = Feb.
How can i make a dataset such that
> DATA
Date Group value
1 Jan a 2
2 Feb a 7
3 Jan b 3
4 Feb b 9
5 Jan c 1
6 Feb c 0
I have added last row such that value for group c in feb is 0.
Thanks
With base R you can use xtabs wrapped in as.data.frame:
as.data.frame(xtabs(formula = value ~ Date + Group, data = data))
# Date Group Freq
#1 Feb a 8
#2 Jan a 6
#3 Feb b 4
#4 Jan b 1
#5 Feb c 0
#6 Jan c 10
Using merge:
#get all combinations of 2 columns
all.comb <- expand.grid(unique(data$Date),unique(data$Group))
colnames(all.comb) <- c("Date","Group")
#merge with all.x=TRUE to keep nonmatched rows
res <- merge(all.comb,data,all.x=TRUE)
#convert NA to 0
res$value[is.na(res$value)] <- 0
#result
res
# Date Group value
# 1 Feb a 3
# 2 Feb b 4
# 3 Feb c 0
# 4 Jan a 5
# 5 Jan b 7
# 6 Jan c 10
Using reshape2
library(reshape2)
melt(dcast(data, Date~Group, value.var="value",fill=0), id.var="Date") #values differ as there was no set.seed()
# Date variable value
#1 Feb a 1
#2 Jan a 10
#3 Feb b 7
#4 Jan b 4
#5 Feb c 0
#6 Jan c 5
Or using dplyr
library(dplyr)
library(tidyr)
data%>%
spread(Group, value, fill=0) %>%
gather(Group, value, a:c)
# Date Group value
#1 Feb a 1
#2 Jan a 10
#3 Feb b 7
#4 Jan b 4
#5 Feb c 0
#6 Jan c 5

Resources