Get the number of days from the date column [duplicate] - r

This question already has answers here:
Find out the number of days of a month in R
(15 answers)
Closed 3 years ago.
I have column - Month, and there are few months, I need to find out the number of day in that month.
I need to find the number of days in a month considering the leap year and that the month can go to June 2019, or current month. The logic should be applied on the column.
MONTH no_of_days
Jan,2017
Feb,2017
Mar,2017
Apr,2017
May,2017
Jun,2017
Jul,2017
Aug,2017
Sep,2017
Oct,2017
Nov,2017
Dec,2017
structure(list(MONTH = structure(c(9L, 7L, 15L, 1L, 17L, 13L,
11L, 3L, 23L, 21L, 19L, 5L, 10L, 8L, 16L, 2L, 18L, 14L, 12L,
4L, 24L, 22L, 20L, 6L), .Label = c("Apr,2017", "Apr,2018", "Aug,2017",
"Aug,2018", "Dec,2017", "Dec,2018", "Feb,2017", "Feb,2018", "Jan,2017",
"Jan,2018", "Jul,2017", "Jul,2018", "Jun,2017", "Jun,2018", "Mar,2017",
"Mar,2018", "May,2017", "May,2018", "Nov,2017", "Nov,2018", "Oct,2017",
"Oct,2018", "Sep,2017", "Sep,2018"), class = "factor")), class = "data.frame", row.names = c(NA,
-24L))

You need to first get the column MONTH to Date and then use any of the function shown here to get number of days in a month.
as.Date(paste0(1, df$MONTH), "%d%b,%Y")
#[1] "2017-01-01" "2017-02-01" "2017-03-01" "2017-04-01" "2017-05-01" "2017-06-01"
# "2017-07-01" "2017-08-01" "2017-09-01" "2017-10-01" "2017-11-01" "2017-12-01"
Hmisc::monthDays(as.Date(paste0(1, df$MONTH), "%d%b,%Y"))
#[1] 31 28 31 30 31 30 31 31 30 31 30 31
data
df <- structure(list(MONTH = structure(c(5L, 4L, 8L, 1L, 9L, 7L, 6L,
2L, 12L, 11L, 10L, 3L), .Label = c("Apr,2017", "Aug,2017", "Dec,2017",
"Feb,2017", "Jan,2017", "Jul,2017", "Jun,2017", "Mar,2017", "May,2017",
"Nov,2017", "Oct,2017", "Sep,2017"), class = "factor")), class =
"data.frame", row.names = c(NA, -12L))

You'll need to wrangle your MONTH column into a date object, then you can use lubridate::days_in_month():
if (!require(lubridate)) install.packages('lubridate')
if (!require(dplyr)) install.packages('dplyr')
library(lubridate)
library(dplyr)
dat %>%
mutate(MONTH = paste("01", MONTH, sep = ",") %>% dmy(),
no_of_days = days_in_month(MONTH))
# A tibble: 12 x 2
MONTH no_of_days
<date> <int>
1 2017-01-01 31
2 2017-02-01 28
3 2017-03-01 31
4 2017-04-01 30
5 2017-05-01 31
6 2017-06-01 30
7 2017-07-01 31
8 2017-08-01 31
9 2017-09-01 30
10 2017-10-01 31
11 2017-11-01 30
12 2017-12-01 31

The package lubridate has a function called days_in_month(). So you can first convert the MONTH column into dates and then use days_in_month:
library(lubridate)
library(zoo)
df <- structure(list(MONTH = structure(c(9L, 7L, 15L, 1L, 17L, 13L, 11L, 3L, 23L, 21L, 19L, 5L, 10L, 8L, 16L, 2L, 18L, 14L, 12L, 4L, 24L, 22L, 20L, 6L), .Label = c("Apr,2017", "Apr,2018", "Aug,2017", "Aug,2018", "Dec,2017", "Dec,2018", "Feb,2017", "Feb,2018", "Jan,2017", "Jan,2018", "Jul,2017", "Jul,2018", "Jun,2017", "Jun,2018", "Mar,2017", "Mar,2018", "May,2017", "May,2018", "Nov,2017", "Nov,2018", "Oct,2017", "Oct,2018", "Sep,2017", "Sep,2018"), class = "factor")), class = "data.frame", row.names = c(NA, -24L))
df$no_of_days <- days_in_month(as.Date(as.yearmon(df$MONTH, "%b,%Y")))

Related

Explicit factor NAs in a data frame

I have the following data frame with ages binned in ranges of 5 years and the frequency of a condition happening in males/females. The problem is that there were no occurrences in either gender for example in the range 15-20.
structure(list(age = structure(c(1L, 2L, 3L, 5L, 6L, 7L, 8L,
9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L), .Label = c("[0,5]",
"(5,10]", "(10,15]", "(15,20]", "(20,25]", "(25,30]", "(30,35]",
"(35,40]", "(40,45]", "(45,50]", "(50,55]", "(55,60]", "(60,65]",
"(65,70]", "(70,75]", "(75,80]", "(80,85]", "(85,90]", "(90,95]",
"(95,100]"), class = "factor"), male = c(2L, 1L, 1L, 4L, 8L,
9L, 20L, 33L, 49L, 104L, 112L, 176L, 159L, 140L, 94L, 72L, 32L,
6L, 2L), female = c(1L, 1L, NA, 7L, 7L, 4L, 23L, 39L, 44L, 74L,
94L, 111L, 124L, 129L, 110L, 92L, 76L, 30L, 7L)), row.names = c(NA,
-19L), groups = structure(list(age = structure(c(1L, 2L, 3L,
5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L,
19L, 20L), .Label = c("[0,5]", "(5,10]", "(10,15]", "(15,20]",
"(20,25]", "(25,30]", "(30,35]", "(35,40]", "(40,45]", "(45,50]",
"(50,55]", "(55,60]", "(60,65]", "(65,70]", "(70,75]", "(75,80]",
"(80,85]", "(85,90]", "(90,95]", "(95,100]"), class = "factor"),
.rows = structure(list(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -19L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
If I check the levels it properly shows all levels.
what I would want is a data frame where all ranges of ages show up and when they don't exist substitute then by 0.
You can use complete :
library(dplyr)
library(tidyr)
df %>%
ungroup %>%
complete(age, fill = list(male = 0, female = 0))
# age male female
# <fct> <dbl> <dbl>
# 1 [0,5] 2 1
# 2 (5,10] 1 1
# 3 (10,15] 1 0
# 4 (15,20] 0 0
# 5 (20,25] 4 7
# 6 (25,30] 8 7
# 7 (30,35] 9 4
# 8 (35,40] 20 23
# 9 (40,45] 33 39
#10 (45,50] 49 44
#11 (50,55] 104 74
#12 (55,60] 112 94
#13 (60,65] 176 111
#14 (65,70] 159 124
#15 (70,75] 140 129
#16 (75,80] 94 110
#17 (80,85] 72 92
#18 (85,90] 32 76
#19 (90,95] 6 30
#20 (95,100] 2 7

Replace multiple characters from multiple columns in R

Given a dataframe as follows:
structure(list(date = structure(1:24, .Label = c("2010Y1-01m",
"2010Y1-02m", "2010Y1-03m", "2010Y1-04m", "2010Y1-05m", "2010Y1-06m",
"2010Y1-07m", "2010Y1-08m", "2010Y1-09m", "2010Y1-10m", "2010Y1-11m",
"2010Y1-12m", "2011Y1-01m", "2011Y1-02m", "2011Y1-03m", "2011Y1-04m",
"2011Y1-05m", "2011Y1-06m", "2011Y1-07m", "2011Y1-08m", "2011Y1-09m",
"2011Y1-10m", "2011Y1-11m", "2011Y1-12m"), class = "factor"),
a = structure(c(1L, 18L, 19L, 20L, 22L, 23L, 2L, 4L, 5L,
7L, 8L, 10L, 1L, 21L, 3L, 6L, 9L, 11L, 12L, 13L, 14L, 15L,
16L, 17L), .Label = c("--", "10159.28", "10295.69", "10580.82",
"10995.65", "11245.84", "11327.23", "11621.99", "12046.63",
"12139.78", "12848.27", "13398.26", "13962.6", "14559.72",
"14982.58", "15518.64", "15949.87", "7363.45", "8237.71",
"8830.99", "9309.47", "9316.56", "9795.77"), class = "factor"),
b = structure(c(2L, 16L, 23L, 24L, 4L, 6L, 7L, 9L, 10L, 12L,
14L, 17L, 1L, 22L, 3L, 5L, 8L, 11L, 13L, 15L, 18L, 19L, 20L,
21L), .Label = c("-", "--", "1058.18", "1455.6", "1539.01",
"1867.07", "2036.92", "2102.23", "2372.84", "2693.96", "2769.65",
"2973.04", "3146.88", "3227.23", "3604.71", "365.07", "3678.01",
"4043.18", "4438.55", "4860.76", "5360.94", "555.51", "653.19",
"980.72"), class = "factor"), c = structure(c(2L, 6L, 10L,
11L, 13L, 15L, 16L, 18L, 20L, 22L, 24L, 7L, 1L, 9L, 12L,
14L, 17L, 19L, 21L, 23L, 3L, 4L, 5L, 8L), .Label = c("-",
"--", "1092.73", "1222.48", "1409.07", "158.18", "1748.44",
"2179.42", "227.68", "268.53", "331.81", "366.95", "434.19",
"486.41", "538.49", "606.62", "614.75", "651.46", "729.44",
"736.55", "836.46", "890.81", "929.72", "981.65"), class = "factor")), class = "data.frame", row.names = c(NA,
-24L))
How could I replace -- and - in only columns a and b with NA? Thanks.
You can use :
cols <- c('a', 'b')
df[cols][df[cols] == '--' | df[cols] == '-'] <- NA
Or using dplyr :
library(dplyr)
df %>% mutate(across(c(a, b), ~replace(., . %in% c('--', '-'), NA)))
I think it's better to try to avoid the data being read in like this in the first place, but if you need to correct it after, you can try using the na.strings argument in type.convert. Notice that it's na.strings with an "s" -- it's plural, so more than one value can be used to represent NA values.
df[c("a", "b")] <- lapply(df[c("a", "b")], type.convert, na.strings = c("--", "-"))
str(df)
# 'data.frame': 24 obs. of 4 variables:
# $ date: Factor w/ 24 levels "2010Y1-01m","2010Y1-02m",..: 1 2 3 4 5 6 7 8 9 10 ...
# $ a : num NA 7363 8238 8831 9317 ...
# $ b : num NA 365 653 981 1456 ...
# $ c : Factor w/ 24 levels "-","--","1092.73",..: 2 6 10 11 13 15 16 18 20 22 ...
head(df)
# date a b c
# 1 2010Y1-01m NA NA --
# 2 2010Y1-02m 7363.45 365.07 158.18
# 3 2010Y1-03m 8237.71 653.19 268.53
# 4 2010Y1-04m 8830.99 980.72 331.81
# 5 2010Y1-05m 9316.56 1455.60 434.19
# 6 2010Y1-06m 9795.77 1867.07 538.49
Note that in this particular case, you could also use the side effect of as.numeric(as.character(...)) converting anything that can't be coerced to numeric to NA, but keep in mind that you will get a warning for each column that you use this approach on.
lapply(df[c("a", "b")], function(x) as.numeric(as.character(x)))

delete observations by days in R

My dataset has the next structure
df=structure(list(Data = structure(c(12L, 13L, 14L, 15L, 16L, 17L,
18L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L), .Label = c("01.01.2018",
"02.01.2018", "03.01.2018", "04.01.2018", "05.01.2018", "06.01.2018",
"07.01.2018", "12.02.2018", "13.02.2018", "14.02.2018", "15.02.2018",
"25.12.2017", "26.12.2017", "27.12.2017", "28.12.2017", "29.12.2017",
"30.12.2017", "31.12.2017"), class = "factor"), sku = 1:18, metric = c(100L,
210L, 320L, 430L, 540L, 650L, 760L, 870L, 980L, 1090L, 1200L,
1310L, 1420L, 1530L, 1640L, 1750L, 1860L, 1970L), action = c(0L,
0L, 0L, 0L, 0L, 0L, 0L, 0L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
1L)), .Names = c("Data", "sku", "metric", "action"), class = "data.frame", row.names = c(NA,
-18L))
I need to delete observations that have certain dates.
But in this dataset there is action variable. The action column has only two values 0 and 1.
Observations on these certain dates should be deleted only for the zero category of action.
these dates are presented in a separate datase.
datedata=structure(list(Data = structure(c(18L, 19L, 20L, 21L, 22L, 5L,
7L, 9L, 11L, 13L, 15L, 17L, 23L, 1L, 2L, 3L, 4L, 6L, 8L, 10L,
12L, 14L, 16L), .Label = c("01.05.2018", "02.05.2018", "03.05.2018",
"04.05.2018", "05.03.2018", "05.05.2018", "06.03.2018", "06.05.2018",
"07.03.2018", "07.05.2018", "08.03.2018", "08.05.2018", "09.03.2018",
"09.05.2018", "10.03.2018", "10.05.2018", "11.03.2018", "21.02.2018",
"22.02.2018", "23.02.2018", "24.02.2018", "25.02.2018", "30.04.2018"
), class = "factor")), .Names = "Data", class = "data.frame", row.names = c(NA,
-23L))
how can i do it?
A solution is to use dplyr::filter as:
library(dplyr)
library(lubridate)
df %>% mutate(Data = dmy(Data)) %>%
filter(action==1 | (action==0 & !(Data %in% dmy(datedata$Data))))
# Data sku metric action
# 1 2017-12-25 1 100 0
# 2 2017-12-26 2 210 0
# 3 2017-12-27 3 320 0
# 4 2017-12-28 4 430 0
# 5 2017-12-29 5 540 0
# 6 2017-12-30 6 650 0
# 7 2017-12-31 7 760 0
# 8 2018-01-01 8 870 0
# 9 2018-01-02 9 980 1
# 10 2018-01-03 10 1090 1
# 11 2018-01-04 11 1200 1
# 12 2018-01-05 12 1310 1
# 13 2018-01-06 13 1420 1
# 14 2018-01-07 14 1530 1
# 15 2018-02-12 15 1640 1
# 16 2018-02-13 16 1750 1
# 17 2018-02-14 17 1860 1
# 18 2018-02-15 18 1970 1
I guess this will work. Fist use match to see weather there is a match in the day of df and the day in datedata, then filter it
library (dplyr)
df <- df %>% mutate (Data.flag = match(Data,datedata$Data)) %>%
filter(!is.na(Data.flag) & action == 0)

merge based on an id with missing values and string

my df is shown below
mydf<- structure(list(IDs = c(11L, 16L, 19L, 21L, 22L, 24L, 42L, 43L,
51L), string1 = structure(c(1L, 8L, 7L, 2L, 4L, 9L, 6L, 3L, 5L
), .Label = c("b", "g", "hue", "hyu", "if", "jud", "ufhy", "uhgf;ffugf",
"uhgs"), class = "factor"), IDs.1 = c(4L, 11L, 16L, 19L, 20L,
22L, 29L, NA, NA), string2 = structure(c(2L, 3L, 8L, 7L, 4L,
5L, 6L, 1L, 1L), .Label = c("", "a", "b", "higf;hdugd", "hyu",
"inja", "ufhy", "uhgf;ffugf"), class = "factor")), .Names = c("IDs",
"string1", "IDs.1", "string2"), class = "data.frame", row.names = c(NA,
-9L))
I want to get them together like below
myout<- structure(list(Ids = c(4L, 11L, 16L, 19L, 20L, 21L, 22L, 24L,
29L, 42L, 43L, 51L), string = structure(c(1L, 2L, 11L, 10L, 4L,
3L, 6L, 12L, 8L, 9L, 5L, 7L), .Label = c("a", "b", "g", "higf;hdugd",
"hue", "hyu", "if", "inja", "jud", "ufhy", "uhgf;ffugf", "uhgs"
), class = "factor")), .Names = c("Ids", "string"), class = "data.frame", row.names = c(NA,
-12L))
I tried to do it using merge
df1 <- mydf[,1:2]
df2 <- mydf[,3:4]
df3 = merge(df1, df2, by.x=c("IDs", "string"))
which gives me an error because they are unequal
I also tried to use the approach given here
How to join (merge) data frames (inner, outer, left, right)? which did not solve my problem
my input is like this
IDs string1 IDs string2
11 b 4 a
16 uhgf;ffugf 11 b
19 ufhy 16 uhgf;ffugf
21 g 19 ufhy
22 hyu 20 higf;hdugd
24 uhgs 22 hyu
42 jud 29 inja
43 hue
51 if
and the output looks like this
Ids string
4 a
11 b
16 uhgf;ffugf
19 ufhy
20 higf;hdugd
21 g
22 hyu
24 uhgs
29 inja
42 jud
43 hue
51 if
e.g. 11, 16 etc are repeated twice , so we only want them once
We can do an rbind and remove the duplicated elements
library(data.table)
setnames(rbindlist(list(mydf[3:4], mydf[1:2]))[!is.na(IDs.1)&!duplicated(IDs.1)],
c("Ids", "string"))[order(Ids)]
# Ids string
# 1: 4 a
# 2: 11 b
# 3: 16 uhgf;ffugf
# 4: 19 ufhy
# 5: 20 higf;hdugd
# 6: 21 g
# 7: 22 hyu
# 8: 24 uhgs
# 9: 29 inja
#10: 42 jud
#11: 43 hue
#12: 51 if
Or another option is melt from data.table (to convert to 'long' format) which can take multiple measure patterns, then remove the duplicated 'Ids' and order using 'Ids'.
melt(setDT(mydf), measure = patterns("ID", "string"), na.rm=TRUE,
value.name = c("Ids", "string"))[!duplicated(Ids, fromLast=TRUE)
][, variable := NULL][order(Ids)]

Sum and replace columns with same name R for a data frame containing different classes

I have a data frame containing multiple classes, I would like to sum those columns that have the same name and are numeric, and replace the old columns with the new sum, does anyone know a way to do this?
i.e I have a data frame like:
col1 col2 col3 col3
char factor int int
I would like to produce
col1 col2 col3
char factor 2int
I have previously used:
data <- as.data.frame(do.call(cbind, by(t(data),INDICES=names(data),FUN=colSums)))
However this was on a dataframe that only had numeric variables.
There are other examples on the internet but not meeting the conditions of: replacement, preserving the rest of the frame, and of being on a frame with multiple classes
Similar question: how do I search for columns with same name, add the column values and replace these columns with same name by their sum? Using R
Try
dat1 <- dat #to keep a copy of the original dataset
indx <- sapply(dat, is.numeric)#check which columns are numeric
nm1 <- which(indx)#get the numeric index of the column
indx2 <- duplicated(names(nm1))#check which among the
# integer columns are duplicated
#use `Map` after splitting the "nm1" with its "names", do the `rowSums`
dat[ nm1[!indx2]] <- Map(function(x,y) rowSums(x[y]), list(dat),
split(nm1, names(nm1)))
dat[ -nm1[indx2]]
Update
Or to make it more efficient, only take the "duplicated" and "numeric" columns while leaving the others intact. Create an "index" (indx2) of columns that are duplicated. Subset the "nm1" based on the "indx2" and then do rowSums as described above. Finally, remove the unwanted columns (duplicated ones) by using the "indx3"
indx2 <- duplicated(names(nm1))|duplicated(names(nm1),fromLast=TRUE)
nm2 <- nm1[indx2]
indx3 <- duplicated(names(nm2))
dat[nm2[!indx3]] <- Map(function(x,y) rowSums(x[y]),
list(dat),split(nm2, names(nm2)))
datN <- dat[ -nm2[indx3]]
datN
# col1 col2 col3 col4 col5
#1 16 23 2 10 10
#2 10 18 12 8 18
#3 21 23 15 6 10
#4 14 37 3 5 15
#5 29 39 5 1 11
#6 26 31 14 2 20
#7 25 31 2 8 10
#8 36 31 12 8 6
#9 32 26 13 6 4
#10 16 38 1 7 3
Checking the results
rowSums(dat1[names(dat1) %in% 'col1'])
#[1] 16 10 21 14 29 26 25 36 32 16
rowSums(dat1[names(dat1) %in% 'col2'])
#[1] 23 18 23 37 39 31 31 31 26 38
data
dat <- structure(list(col1 = c(6L, 5L, 15L, 11L, 14L, 19L, 6L, 16L,
17L, 6L), col2 = c(13L, 8L, 14L, 14L, 7L, 19L, 4L, 1L, 11L, 3L
), col3 = structure(c(2L, 5L, 8L, 3L, 4L, 7L, 2L, 5L, 6L, 1L), .Label = c("1",
"2", "3", "5", "12", "13", "14", "15"), class = "factor"), col2 = c(7L,
5L, 8L, 3L, 19L, 5L, 15L, 13L, 14L, 20L), col4 = structure(c(7L,
6L, 4L, 3L, 1L, 2L, 6L, 6L, 4L, 5L), .Label = c("1", "2", "5",
"6", "7", "8", "10"), class = "factor"), col5 = c(10L, 18L, 10L,
15L, 11L, 20L, 10L, 6L, 4L, 3L), col1 = c(10L, 5L, 6L, 3L, 15L,
7L, 19L, 20L, 15L, 10L), col2 = c(3L, 5L, 1L, 20L, 13L, 7L, 12L,
17L, 1L, 15L)), .Names = c("col1", "col2", "col3", "col2", "col4",
"col5", "col1", "col2"), row.names = c(NA, -10L), class = "data.frame")

Resources