ungrouping dataframe in r which is grouped by rows - r

I have this dataframe
Source: local data frame [159 x 2]
Groups: <by row>
# A tibble: 159 × 2
session_id requestId
* <int> <list>
1 1105 <int [3]>
2 1107 <int [2]>
3 1108 <int [6]>
4 1109 <int [1]>
5 1110 <int [6]>
6 1111 <int [8]>
7 1112 <int [4]>
8 1114 <int [8]>
9 1117 <int [7]>
10 1118 <int [4]>
# ... with 149 more rows
I don't know how to ungroup it, or ungrouping is not working because it is grouped by rows rather than by some variable.
I want my output to look like the following pattern/format:
# A tibble: 342 × 2
session_id requestId
<int> <dbl>
1 1105 10
2 1105 3
3 1107 13
4 1107 13
5 1108 4
6 1108 6
7 1109 12
8 1109 5
9 1110 6
10 1110 10
I don't know how to do it; it must be simple if you know how. Thanks for helping.
Edit :-
structure(list(session_id = c(1105L, 1107L, 1108L, 1109L, 1110L,
1111L, 1112L, 1114L, 1117L, 1118L), requestId = list(c(8L, 14L,
20L), c(7L, 14L), c(1L, 7L, 8L, 20L, 16L, 17L), 8L, c(1L, 16L,
17L, 8L, 14L, 20L), c(1L, 7L, 8L, 20L, 4L, 11L, 13L, 14L), c(4L,
11L, 13L, 14L), c(6L, 8L, 14L, 2L, 4L, 10L, 15L, 18L), c(4L,
5L, 10L, 16L, 2L, 15L, 18L), c(20L, 1L, 7L, 8L))), .Names = c("session_id",
"requestId"), row.names = c(NA, -10L), class = c("tbl_df", "tbl",
"data.frame"))

Here's a possible starting point, although it's still missing a reproducible example. Can be edited when we get one:
# Load the tidyverse first so that tibble() is available below.
library(tidyverse)
# `tbl` stands for the question's tibble: an integer column session_id and a
# list-column requestId.
# Number of request ids stored in each row's list element.
tlengths <- lengths(tbl$requestId)
# Repeat each session_id once per request id so both columns have equal length
# (mapply(rep, ...) would return a list, which cannot be paired with the
# flattened requestId vector).
sessions <- rep(tbl$session_id, times = tlengths)
newtbl <- tibble(session_id = sessions, requestId = unlist(tbl$requestId))

As pkumar suggested in a comment, I did this using tidyverse's unnest:
df = structure(list(session_id = c(1105L, 1107L, 1108L, 1109L, 1110L,
1111L, 1112L, 1114L, 1117L, 1118L), requestId = list(c(8L, 14L,
20L), c(7L, 14L), c(1L, 7L, 8L, 20L, 16L, 17L), 8L, c(1L, 16L,
17L, 8L, 14L, 20L), c(1L, 7L, 8L, 20L, 4L, 11L, 13L, 14L), c(4L,
11L, 13L, 14L), c(6L, 8L, 14L, 2L, 4L, 10L, 15L, 18L), c(4L,
5L, 10L, 16L, 2L, 15L, 18L), c(20L, 1L, 7L, 8L))), .Names = c("session_id",
"requestId"), row.names = c(NA, -10L), class = c("tbl_df", "tbl",
"data.frame"))
library("tidyverse")
# Drop any grouping first (the question's printout shows "Groups: <by row>";
# this is a no-op for the plain tibble defined above), then expand the
# list-column so that every request id gets its own row. The column is named
# through `cols` because calling unnest() without it is deprecated and warns
# in tidyr >= 1.0.0.
df %>%
  ungroup() %>%
  unnest(cols = requestId)

Related

Finding the 3 coldest consecutive months

I am trying to write code that finds the 3 consecutive months that are the coldest.
For now I have written code for the first 3 months (1,2,3), but it should then be applied to (4,5,6), (7,8,9), (10,11,12), (2,3,4), (5,6,7), (8,9,10), (11,12,1), (3,4,5), (6,7,8), (9,10,11) and (12,1,2), which are all the possible combinations of 3 consecutive months.
The code I wrote is here :
# Mean of t_q per Site, restricted to months 1, 2 and 3.
cold <- data_example %>%
group_by(Site) %>%
filter(Month %in% c(1,2,3)) %>%
# mutate() repeats the per-Site mean on every remaining row of that Site ...
mutate(mean_temperature = mean(t_q)) %>%
dplyr::select(-c(t_q,Month)) %>%
# ... so distinct() collapses the duplicates to one row per Site.
distinct(Site, mean_temperature)
# Keep only the column of per-Site means as a plain vector.
average_temp_month_1_2_3 <- cold$mean_temperature
Then I replaced c(1,2,3) with each of the other possibilities and created a new column for each output.
I end up with a dataset in which each row corresponds to a Site and the columns are all the possible sets of 3 consecutive months.
Afterwards I took the minimum value of each row using apply() and min(), which gives me the coldest quarter for each Site.
I am looking for a way to generalize this, for example by looping over the possibilities.
The structure of data_example is as follows:
structure(list(Site = c(4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 13L,
13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 14L, 14L,
14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 15L, 15L, 15L,
15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 16L, 16L, 16L, 16L,
16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 17L, 17L, 17L, 17L, 17L,
17L, 17L, 17L, 17L, 17L, 17L, 17L, 18L, 18L, 18L, 18L, 18L, 18L,
18L, 18L, 18L, 18L, 18L, 18L, 25L, 25L, 25L, 25L, 25L, 25L, 25L,
25L, 25L, 25L, 25L, 25L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L,
26L, 26L, 26L, 26L), Month = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L,
12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L,
12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 11L, 12L), t_q = c(9.67754848470332, -6.74555496540183,
5.67969761460384, 12.537207581471, -9.4899105618945, 21.0747672424502,
15.2643039243614, -3.62839910494421, 11.3919028351455, 1.69988257436554,
4.22015024307287, 11.7045830784212, 8.91437673833493, 0.579081429509138,
-10.8207481229903, 7.05356868592628, 13.0911580912516, 17.2032089167605,
-2.47642708849114, -11.2105599344486, 33.986736305027, 17.8578689773214,
-14.9114468266335, 14.4681380389141, 0.568074240873411, 7.65458408777801,
1.91368344556659, 6.01571556896127, 11.4858297513536, 2.2608458985328,
-2.08200762781776, 12.1540989284163, 20.9941815285413, 0.375777604316208,
-2.7137027317614, -6.17690210400591, 11.2549857164403, 17.447156776654,
-6.96565197389579, -5.41542361226991, 11.1680111873065, 16.2266522778922,
-11.4503938582433, 5.93300314835716, -18.2818398656237, 16.2930210946949,
9.80219192652316, -0.48237356523527, 7.72680942503686, 5.84113084181759,
9.66129413490096, -4.53018262186904, 7.42187509892118, 9.2559478576895,
8.25120948667013, 8.18182063263247, 16.3703081943971, 19.5469951420341,
3.71888263185773, -0.150179891749435, 1.32057298670562, -5.63556532224354,
21.3918542474341, 4.58752188336035, 5.49430262894033, 5.99587512047837,
-3.76459024109216, -8.53522098071824, 8.01805680562232, 26.2227490426066,
8.90822434139878, 5.04259034084471, 6.89740304247746, 11.9484584922927,
-11.5085102739471, 30.4526759119379, 21.878533782357, -5.39936677076962,
-9.83965056853816, 19.3083455159472, 7.90653548036154, 3.11876660277767,
-8.85027083180008, -9.9225496831988, 5.97307112581907, -2.83528336599284,
-2.75758002814396, 4.68388181004449, 6.61649031537118, -6.65988084338133,
-0.981075313384259, 5.84898952305179, -5.20962191660178, 0.416662319713158,
-10.5336993269853, 19.5350642296553, 26.9696625385792, 15.3291059661081,
15.0799591208354, 13.2310653499033, 7.2053382722482, -7.87288386491102,
20.8083797469715, 6.16664220270041, 8.3360949793043, -14.4000921795463,
-10.5503025782944, 14.3185205291177, 5.83802399796341, 2.49660818997943,
15.7399297014092, -0.834086173817971, 12.4883230222372, 6.73548467376379,
7.7988835803825, -5.13583355913738, 7.51054162811707, 11.6610602814336,
-11.8864185954223, 4.2704440943851)), row.names = c(NA, -120L
), groups = structure(list(Site = c(4L, 5L, 13L, 14L, 15L, 16L,
17L, 18L, 25L, 26L), .rows = structure(list(1:12, 13:24, 25:36,
37:48, 49:60, 61:72, 73:84, 85:96, 97:108, 109:120), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -10L), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
You can use raster::movingFun to do a moving average with circular data, then use slice_min to get the minimum value per group.
library(dplyr)
# Wrap `x` onto the cycle 1..by: values that are exact multiples of `by`
# (remainder 0) map to `by` itself, everything else maps to its remainder.
circ <- function(x, by) {
  remainder <- x %% by
  ifelse(remainder == 0, by, remainder)
}
# For every Site: compute a 3-month rolling mean of t_q, keep the row(s) with
# the lowest rolling mean, and label the three months of that window.
# NOTE(review): `df` is presumably the question's data_example - confirm.
df %>%
group_by(Site) %>%
# circular = TRUE lets the 3-value window wrap around the ends of each Site's
# series, so windows spanning December/January are included.
mutate(rolmean = raster::movingFun(t_q, n = 3, fun = mean, circular = TRUE)) %>%
# Keep the row(s) with the smallest rolling mean within each Site.
slice_min(rolmean) %>%
# Label the window as previous / current / next month; circ() maps 0 to 12 and
# 13 to 1 so the labels stay within 1..12.
mutate(coldest = toString(circ(c(Month-1, Month, Month+1), 12)))
output
# A tibble: 10 × 5
# Groups: Site [10]
Site Month t_q rolmean coldest
<int> <int> <dbl> <dbl> <chr>
1 4 2 -6.75 2.87 1, 2, 3
2 5 3 -10.8 -1.06 2, 3, 4
3 13 11 -2.71 -2.84 10, 11, 12
4 14 8 5.93 -7.93 7, 8, 9
5 15 3 9.66 3.66 2, 3, 4
6 16 7 -3.76 -2.10 6, 7, 8
7 17 11 -8.85 -5.22 10, 11, 12
8 18 10 0.417 -5.11 9, 10, 11
9 25 10 -14.4 -5.54 9, 10, 11
10 26 12 4.27 -0.593 11, 12, 1
Using which.min in aggregate on a moving average window.
# Coldest run of `win` consecutive months per Site, treating the year as
# circular so that windows such as (11, 12, 1) and (12, 1, 2) are considered.
# Assumes each Site's rows are ordered by Month, so that the position within
# the Site equals the month number (as in the example data).
aggregate(t_q ~ Site, dat, \(s) {
  win <- 3 ## window length
  n <- length(s)
  ## One window starting at every position; indices wrap from the last
  ## position back to the first (for n = 12: ..., 10 11 12, 11 12 1, 12 1 2).
  sq <- lapply(seq_len(n), \(i) (i + seq_len(win) - 2L) %% n + 1L)
  ## Mean temperature of each window; which.min() picks the coldest one.
  means <- vapply(sq, \(w) mean(s[w]), numeric(1))
  toString(sq[[which.min(means)]])
})
# Site t_q
# 1 4 1, 2, 3
# 2 5 2, 3, 4
# 3 13 10, 11, 12
# 4 14 7, 8, 9
# 5 15 2, 3, 4
# 6 16 6, 7, 8
# 7 17 10, 11, 12
# 8 18 9, 10, 11
# 9 25 9, 10, 11
# 10 26 11, 12, 1

Explicit factor NAs in a data frame

I have the following data frame with ages binned in ranges of 5 years and the frequency of a condition happening in males/females. The problem is that there were no occurrences in either gender for example in the range 15-20.
structure(list(age = structure(c(1L, 2L, 3L, 5L, 6L, 7L, 8L,
9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L), .Label = c("[0,5]",
"(5,10]", "(10,15]", "(15,20]", "(20,25]", "(25,30]", "(30,35]",
"(35,40]", "(40,45]", "(45,50]", "(50,55]", "(55,60]", "(60,65]",
"(65,70]", "(70,75]", "(75,80]", "(80,85]", "(85,90]", "(90,95]",
"(95,100]"), class = "factor"), male = c(2L, 1L, 1L, 4L, 8L,
9L, 20L, 33L, 49L, 104L, 112L, 176L, 159L, 140L, 94L, 72L, 32L,
6L, 2L), female = c(1L, 1L, NA, 7L, 7L, 4L, 23L, 39L, 44L, 74L,
94L, 111L, 124L, 129L, 110L, 92L, 76L, 30L, 7L)), row.names = c(NA,
-19L), groups = structure(list(age = structure(c(1L, 2L, 3L,
5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L,
19L, 20L), .Label = c("[0,5]", "(5,10]", "(10,15]", "(15,20]",
"(20,25]", "(25,30]", "(30,35]", "(35,40]", "(40,45]", "(45,50]",
"(50,55]", "(55,60]", "(60,65]", "(65,70]", "(70,75]", "(75,80]",
"(80,85]", "(85,90]", "(90,95]", "(95,100]"), class = "factor"),
.rows = structure(list(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -19L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
If I check the levels, it properly shows all of them.
What I want is a data frame in which all age ranges show up, with the ones that don't exist filled in with 0.
You can use complete :
library(dplyr)
library(tidyr)
# Ungroup the data, then add one row for every level of the factor `age` that
# has no observations; counts for male and female are filled with 0.
complete(ungroup(df), age, fill = list(male = 0, female = 0))
# age male female
# <fct> <dbl> <dbl>
# 1 [0,5] 2 1
# 2 (5,10] 1 1
# 3 (10,15] 1 0
# 4 (15,20] 0 0
# 5 (20,25] 4 7
# 6 (25,30] 8 7
# 7 (30,35] 9 4
# 8 (35,40] 20 23
# 9 (40,45] 33 39
#10 (45,50] 49 44
#11 (50,55] 104 74
#12 (55,60] 112 94
#13 (60,65] 176 111
#14 (65,70] 159 124
#15 (70,75] 140 129
#16 (75,80] 94 110
#17 (80,85] 72 92
#18 (85,90] 32 76
#19 (90,95] 6 30
#20 (95,100] 2 7

Replace multiple characters from multiple columns in R

Given a dataframe as follows:
structure(list(date = structure(1:24, .Label = c("2010Y1-01m",
"2010Y1-02m", "2010Y1-03m", "2010Y1-04m", "2010Y1-05m", "2010Y1-06m",
"2010Y1-07m", "2010Y1-08m", "2010Y1-09m", "2010Y1-10m", "2010Y1-11m",
"2010Y1-12m", "2011Y1-01m", "2011Y1-02m", "2011Y1-03m", "2011Y1-04m",
"2011Y1-05m", "2011Y1-06m", "2011Y1-07m", "2011Y1-08m", "2011Y1-09m",
"2011Y1-10m", "2011Y1-11m", "2011Y1-12m"), class = "factor"),
a = structure(c(1L, 18L, 19L, 20L, 22L, 23L, 2L, 4L, 5L,
7L, 8L, 10L, 1L, 21L, 3L, 6L, 9L, 11L, 12L, 13L, 14L, 15L,
16L, 17L), .Label = c("--", "10159.28", "10295.69", "10580.82",
"10995.65", "11245.84", "11327.23", "11621.99", "12046.63",
"12139.78", "12848.27", "13398.26", "13962.6", "14559.72",
"14982.58", "15518.64", "15949.87", "7363.45", "8237.71",
"8830.99", "9309.47", "9316.56", "9795.77"), class = "factor"),
b = structure(c(2L, 16L, 23L, 24L, 4L, 6L, 7L, 9L, 10L, 12L,
14L, 17L, 1L, 22L, 3L, 5L, 8L, 11L, 13L, 15L, 18L, 19L, 20L,
21L), .Label = c("-", "--", "1058.18", "1455.6", "1539.01",
"1867.07", "2036.92", "2102.23", "2372.84", "2693.96", "2769.65",
"2973.04", "3146.88", "3227.23", "3604.71", "365.07", "3678.01",
"4043.18", "4438.55", "4860.76", "5360.94", "555.51", "653.19",
"980.72"), class = "factor"), c = structure(c(2L, 6L, 10L,
11L, 13L, 15L, 16L, 18L, 20L, 22L, 24L, 7L, 1L, 9L, 12L,
14L, 17L, 19L, 21L, 23L, 3L, 4L, 5L, 8L), .Label = c("-",
"--", "1092.73", "1222.48", "1409.07", "158.18", "1748.44",
"2179.42", "227.68", "268.53", "331.81", "366.95", "434.19",
"486.41", "538.49", "606.62", "614.75", "651.46", "729.44",
"736.55", "836.46", "890.81", "929.72", "981.65"), class = "factor")), class = "data.frame", row.names = c(NA,
-24L))
How could I replace -- and - in only columns a and b with NA? Thanks.
You can use :
cols <- c('a', 'b')
df[cols][df[cols] == '--' | df[cols] == '-'] <- NA
Or using dplyr :
library(dplyr)
df %>% mutate(across(c(a, b), ~replace(., . %in% c('--', '-'), NA)))
I think it's better to try to avoid the data being read in like this in the first place, but if you need to correct it after, you can try using the na.strings argument in type.convert. Notice that it's na.strings with an "s" -- it's plural, so more than one value can be used to represent NA values.
# Re-parse columns a and b, treating both "--" and "-" as missing values.
# as.is is given explicitly because type.convert() warns when it is left
# unspecified (R >= 4.0.0); it does not affect the numeric result here.
df[c("a", "b")] <- lapply(
  df[c("a", "b")],
  type.convert,
  na.strings = c("--", "-"),
  as.is = TRUE
)
str(df)
# 'data.frame': 24 obs. of 4 variables:
# $ date: Factor w/ 24 levels "2010Y1-01m","2010Y1-02m",..: 1 2 3 4 5 6 7 8 9 10 ...
# $ a : num NA 7363 8238 8831 9317 ...
# $ b : num NA 365 653 981 1456 ...
# $ c : Factor w/ 24 levels "-","--","1092.73",..: 2 6 10 11 13 15 16 18 20 22 ...
head(df)
# date a b c
# 1 2010Y1-01m NA NA --
# 2 2010Y1-02m 7363.45 365.07 158.18
# 3 2010Y1-03m 8237.71 653.19 268.53
# 4 2010Y1-04m 8830.99 980.72 331.81
# 5 2010Y1-05m 9316.56 1455.60 434.19
# 6 2010Y1-06m 9795.77 1867.07 538.49
Note that in this particular case, you could also use the side effect of as.numeric(as.character(...)) converting anything that can't be coerced to numeric to NA, but keep in mind that you will get a warning for each column that you use this approach on.
lapply(df[c("a", "b")], function(x) as.numeric(as.character(x)))

How to use purrr functions inside dplyr's select_if

I'm trying to find the shortest possible dplyr-purrr combination.
Can I reduce the following statement which combines select_if() and map_df() ?
training.set.imputed %>%
select_if(~sum(is.na(.))>0) %>% map_df(~sum(is.na(.)))
I tried this:
training.set.imputed %>%
select_if(~sum(is.na(.))>0, .funs = ~sum(is.na(.)))
which throws this error:
Error: nm must be NULL or a character vector the same length as x
What does this mean?
Any ideas how to form the .funs term?
The .funs argument in select_if requires a renaming function, and not a mutating function, so you can do something like this with it, but you can't mutate the variable values:
tibble(blah = 1:2, bleh = 3:4, bluh = c(NA, NA)) %>%
select_if(~ sum(is.na(.x)) > 0, .funs = toupper)
#### OUTPUT ####
# A tibble: 2 x 1
BLUH
<lgl>
1 NA
2 NA
If you insist on using a combination of purrr and dplyr, then this is probably your best bet (Edit: I just noticed that G. Grothendieck gave this answer, but I'll include it anyway for the sake of completeness.):
df %>%
map_df(~ sum(is.na(.))) %>%
select_if(~ . > 0)
#### OUTPUT ####
# A tibble: 1 x 2
b d
<int> <int>
1 4 1
However, you can simplify it a little by just using dplyr's summarize_if:
df %>%
summarise_if(anyNA, ~ sum(is.na(.)))
#### OUTPUT ####
# A tibble: 1 x 2
b d
<int> <int>
1 4 1
Since you're really just after column sums, base R might offer the most concise option:
colSums(is.na(df)) %>%
.[. > 0]
#### OUTPUT ####
b d
4 1
Data
structure(list(a = c(2L, 2L, 5L, 10L, 10L, 18L, 18L, 19L, 11L,
14L, 12L, 10L, 4L, 16L, 5L, 5L, 11L, 2L, 14L, 7L), b = c(10L,
20L, 16L, NA, 6L, 1L, 11L, 12L, 12L, 12L, 8L, NA, NA, 8L, 11L,
19L, 8L, 9L, NA, 19L), c = c(11L, 11L, 20L, 8L, 15L, 4L, 17L,
4L, 4L, 11L, 20L, 11L, 6L, 12L, 17L, 7L, 14L, 18L, 15L, 19L),
d = c(19L, 16L, 17L, 14L, 8L, 19L, 7L, 6L, 6L, 13L, 7L, 19L,
11L, 17L, NA, 10L, 3L, 3L, 3L, 2L), e = c(12L, 17L, 14L,
5L, 8L, 19L, 8L, 3L, 17L, 1L, 2L, 6L, 5L, 17L, 14L, 5L, 8L,
2L, 8L, 2L)), row.names = c(NA, -20L), class = c("tbl_df",
"tbl", "data.frame"))
I assume you want the number of NAs in each column keeping only columns that have at least 1 NA.
1) This avoids the code duplication and does not give an error. First calculate the number of NAs in each column and then pick out the columns whose count is greater than 0.
# test input - BOD comes with R
BOD[1,2] <- NA
BOD %>%
map_df(~ sum(is.na(.))) %>%
select_if(~ . > 0)
giving:
# A tibble: 1 x 1
demand
<int>
1 1
2) This first selects out those columns with at least one NA and then finds the number of NAs in those columns giving the same result:
BOD %>%
select_if(anyNA) %>%
map_df(~ sum(is.na(.)))

delete observations by days in R

My dataset has the next structure
df=structure(list(Data = structure(c(12L, 13L, 14L, 15L, 16L, 17L,
18L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L), .Label = c("01.01.2018",
"02.01.2018", "03.01.2018", "04.01.2018", "05.01.2018", "06.01.2018",
"07.01.2018", "12.02.2018", "13.02.2018", "14.02.2018", "15.02.2018",
"25.12.2017", "26.12.2017", "27.12.2017", "28.12.2017", "29.12.2017",
"30.12.2017", "31.12.2017"), class = "factor"), sku = 1:18, metric = c(100L,
210L, 320L, 430L, 540L, 650L, 760L, 870L, 980L, 1090L, 1200L,
1310L, 1420L, 1530L, 1640L, 1750L, 1860L, 1970L), action = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L)), .Names = c("Data", "sku", "metric", "action"), class = "data.frame", row.names = c(NA,
-18L))
I need to delete observations that fall on certain dates.
But this dataset also has an action variable. The action column has only two values, 0 and 1.
Observations on those dates should be deleted only for the zero category of action.
These dates are provided in a separate dataset:
datedata=structure(list(Data = structure(c(18L, 19L, 20L, 21L, 22L, 5L,
7L, 9L, 11L, 13L, 15L, 17L, 23L, 1L, 2L, 3L, 4L, 6L, 8L, 10L,
12L, 14L, 16L), .Label = c("01.05.2018", "02.05.2018", "03.05.2018",
"04.05.2018", "05.03.2018", "05.05.2018", "06.03.2018", "06.05.2018",
"07.03.2018", "07.05.2018", "08.03.2018", "08.05.2018", "09.03.2018",
"09.05.2018", "10.03.2018", "10.05.2018", "11.03.2018", "21.02.2018",
"22.02.2018", "23.02.2018", "24.02.2018", "25.02.2018", "30.04.2018"
), class = "factor")), .Names = "Data", class = "data.frame", row.names = c(NA,
-23L))
how can i do it?
A solution is to use dplyr::filter as:
library(dplyr)
library(lubridate)
# Parse the day.month.year strings in Data into dates, then keep every
# action == 1 row plus those action == 0 rows whose date is not listed in
# datedata (i.e. listed dates are removed for action == 0 only).
df %>% mutate(Data = dmy(Data)) %>%
filter(action==1 | (action==0 & !(Data %in% dmy(datedata$Data))))
# Data sku metric action
# 1 2017-12-25 1 100 0
# 2 2017-12-26 2 210 0
# 3 2017-12-27 3 320 0
# 4 2017-12-28 4 430 0
# 5 2017-12-29 5 540 0
# 6 2017-12-30 6 650 0
# 7 2017-12-31 7 760 0
# 8 2018-01-01 8 870 0
# 9 2018-01-02 9 980 1
# 10 2018-01-03 10 1090 1
# 11 2018-01-04 11 1200 1
# 12 2018-01-05 12 1310 1
# 13 2018-01-06 13 1420 1
# 14 2018-01-07 14 1530 1
# 15 2018-02-12 15 1640 1
# 16 2018-02-13 16 1750 1
# 17 2018-02-14 17 1860 1
# 18 2018-02-15 18 1970 1
I guess this will work. First use match to see whether the day in df has a match among the days in datedata, then filter on it.
library(dplyr)
# Data.flag holds the position of each date in datedata$Data, or NA when the
# date is not listed there.
# Keep a row unless its date is listed AND it belongs to the action == 0
# category; those are the only rows that have to be deleted. (The previous
# condition was inverted and kept exactly the rows meant to be removed.)
df <- df %>%
  mutate(Data.flag = match(Data, datedata$Data)) %>%
  filter(is.na(Data.flag) | action != 0)

Resources