Related
I have the following data frame with ages binned in ranges of 5 years and the frequency of a condition occurring in males/females. The problem is that some ranges, for example 15-20, have no occurrences in either gender and are therefore missing from the data frame.
structure(list(age = structure(c(1L, 2L, 3L, 5L, 6L, 7L, 8L,
9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L), .Label = c("[0,5]",
"(5,10]", "(10,15]", "(15,20]", "(20,25]", "(25,30]", "(30,35]",
"(35,40]", "(40,45]", "(45,50]", "(50,55]", "(55,60]", "(60,65]",
"(65,70]", "(70,75]", "(75,80]", "(80,85]", "(85,90]", "(90,95]",
"(95,100]"), class = "factor"), male = c(2L, 1L, 1L, 4L, 8L,
9L, 20L, 33L, 49L, 104L, 112L, 176L, 159L, 140L, 94L, 72L, 32L,
6L, 2L), female = c(1L, 1L, NA, 7L, 7L, 4L, 23L, 39L, 44L, 74L,
94L, 111L, 124L, 129L, 110L, 92L, 76L, 30L, 7L)), row.names = c(NA,
-19L), groups = structure(list(age = structure(c(1L, 2L, 3L,
5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L,
19L, 20L), .Label = c("[0,5]", "(5,10]", "(10,15]", "(15,20]",
"(20,25]", "(25,30]", "(30,35]", "(35,40]", "(40,45]", "(45,50]",
"(50,55]", "(55,60]", "(60,65]", "(65,70]", "(70,75]", "(75,80]",
"(80,85]", "(85,90]", "(90,95]", "(95,100]"), class = "factor"),
.rows = structure(list(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -19L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
If I check the levels it properly shows all levels.
What I want is a data frame in which every age range shows up, with 0 substituted for the ranges that do not exist in the data.
You can use complete :
library(dplyr)
library(tidyr)
# Drop the grouping first, then expand `age` to every level of the factor;
# the counts for the rows that had to be added are filled with 0.
df %>%
  ungroup() %>%
  complete(age, fill = list(male = 0, female = 0))
# age male female
# <fct> <dbl> <dbl>
# 1 [0,5] 2 1
# 2 (5,10] 1 1
# 3 (10,15] 1 0
# 4 (15,20] 0 0
# 5 (20,25] 4 7
# 6 (25,30] 8 7
# 7 (30,35] 9 4
# 8 (35,40] 20 23
# 9 (40,45] 33 39
#10 (45,50] 49 44
#11 (50,55] 104 74
#12 (55,60] 112 94
#13 (60,65] 176 111
#14 (65,70] 159 124
#15 (70,75] 140 129
#16 (75,80] 94 110
#17 (80,85] 72 92
#18 (85,90] 32 76
#19 (90,95] 6 30
#20 (95,100] 2 7
Given a dataframe as follows:
structure(list(date = structure(1:24, .Label = c("2010Y1-01m",
"2010Y1-02m", "2010Y1-03m", "2010Y1-04m", "2010Y1-05m", "2010Y1-06m",
"2010Y1-07m", "2010Y1-08m", "2010Y1-09m", "2010Y1-10m", "2010Y1-11m",
"2010Y1-12m", "2011Y1-01m", "2011Y1-02m", "2011Y1-03m", "2011Y1-04m",
"2011Y1-05m", "2011Y1-06m", "2011Y1-07m", "2011Y1-08m", "2011Y1-09m",
"2011Y1-10m", "2011Y1-11m", "2011Y1-12m"), class = "factor"),
a = structure(c(1L, 18L, 19L, 20L, 22L, 23L, 2L, 4L, 5L,
7L, 8L, 10L, 1L, 21L, 3L, 6L, 9L, 11L, 12L, 13L, 14L, 15L,
16L, 17L), .Label = c("--", "10159.28", "10295.69", "10580.82",
"10995.65", "11245.84", "11327.23", "11621.99", "12046.63",
"12139.78", "12848.27", "13398.26", "13962.6", "14559.72",
"14982.58", "15518.64", "15949.87", "7363.45", "8237.71",
"8830.99", "9309.47", "9316.56", "9795.77"), class = "factor"),
b = structure(c(2L, 16L, 23L, 24L, 4L, 6L, 7L, 9L, 10L, 12L,
14L, 17L, 1L, 22L, 3L, 5L, 8L, 11L, 13L, 15L, 18L, 19L, 20L,
21L), .Label = c("-", "--", "1058.18", "1455.6", "1539.01",
"1867.07", "2036.92", "2102.23", "2372.84", "2693.96", "2769.65",
"2973.04", "3146.88", "3227.23", "3604.71", "365.07", "3678.01",
"4043.18", "4438.55", "4860.76", "5360.94", "555.51", "653.19",
"980.72"), class = "factor"), c = structure(c(2L, 6L, 10L,
11L, 13L, 15L, 16L, 18L, 20L, 22L, 24L, 7L, 1L, 9L, 12L,
14L, 17L, 19L, 21L, 23L, 3L, 4L, 5L, 8L), .Label = c("-",
"--", "1092.73", "1222.48", "1409.07", "158.18", "1748.44",
"2179.42", "227.68", "268.53", "331.81", "366.95", "434.19",
"486.41", "538.49", "606.62", "614.75", "651.46", "729.44",
"736.55", "836.46", "890.81", "929.72", "981.65"), class = "factor")), class = "data.frame", row.names = c(NA,
-24L))
How could I replace -- and - in only columns a and b with NA? Thanks.
You can use :
# Columns in which the placeholder strings should be turned into NA.
cols <- c('a', 'b')
# Compare only the selected columns against both placeholders to get a
# logical mask, then assign NA to the matching cells of those columns.
df[cols][df[cols] == '--' | df[cols] == '-'] <- NA
Or using dplyr :
library(dplyr)
# In columns `a` and `b` only, set every '--' or '-' entry to NA.
df %>%
  mutate(across(c(a, b), function(x) replace(x, x %in% c('--', '-'), NA)))
I think it's better to try to avoid the data being read in like this in the first place, but if you need to correct it after, you can try using the na.strings argument in type.convert. Notice that it's na.strings with an "s" -- it's plural, so more than one value can be used to represent NA values.
# Re-parse columns `a` and `b`, treating both "--" and "-" as missing values.
# `as.is` is passed explicitly because type.convert() warns when the caller
# leaves it unspecified (R >= 4.0); it does not change the numeric result here.
df[c("a", "b")] <- lapply(
  df[c("a", "b")],
  type.convert,
  na.strings = c("--", "-"),
  as.is = TRUE
)
str(df)
# 'data.frame': 24 obs. of 4 variables:
# $ date: Factor w/ 24 levels "2010Y1-01m","2010Y1-02m",..: 1 2 3 4 5 6 7 8 9 10 ...
# $ a : num NA 7363 8238 8831 9317 ...
# $ b : num NA 365 653 981 1456 ...
# $ c : Factor w/ 24 levels "-","--","1092.73",..: 2 6 10 11 13 15 16 18 20 22 ...
head(df)
# date a b c
# 1 2010Y1-01m NA NA --
# 2 2010Y1-02m 7363.45 365.07 158.18
# 3 2010Y1-03m 8237.71 653.19 268.53
# 4 2010Y1-04m 8830.99 980.72 331.81
# 5 2010Y1-05m 9316.56 1455.60 434.19
# 6 2010Y1-06m 9795.77 1867.07 538.49
Note that in this particular case, you could also use the side effect of as.numeric(as.character(...)) converting anything that can't be coerced to numeric to NA, but keep in mind that you will get a warning for each column that you use this approach on.
lapply(df[c("a", "b")], function(x) as.numeric(as.character(x)))
This question already has answers here:
Find out the number of days of a month in R
(15 answers)
Closed 3 years ago.
I have a column, MONTH, that contains several months, and I need to find the number of days in each of those months.
The number of days must take leap years into account, and the months can run up to June 2019 or the current month. The logic should be applied to the whole column.
MONTH no_of_days
Jan,2017
Feb,2017
Mar,2017
Apr,2017
May,2017
Jun,2017
Jul,2017
Aug,2017
Sep,2017
Oct,2017
Nov,2017
Dec,2017
structure(list(MONTH = structure(c(9L, 7L, 15L, 1L, 17L, 13L,
11L, 3L, 23L, 21L, 19L, 5L, 10L, 8L, 16L, 2L, 18L, 14L, 12L,
4L, 24L, 22L, 20L, 6L), .Label = c("Apr,2017", "Apr,2018", "Aug,2017",
"Aug,2018", "Dec,2017", "Dec,2018", "Feb,2017", "Feb,2018", "Jan,2017",
"Jan,2018", "Jul,2017", "Jul,2018", "Jun,2017", "Jun,2018", "Mar,2017",
"Mar,2018", "May,2017", "May,2018", "Nov,2017", "Nov,2018", "Oct,2017",
"Oct,2018", "Sep,2017", "Sep,2018"), class = "factor")), class = "data.frame", row.names = c(NA,
-24L))
You first need to convert the MONTH column to Date and then use any of the functions shown here to get the number of days in a month.
as.Date(paste0(1, df$MONTH), "%d%b,%Y")
#[1] "2017-01-01" "2017-02-01" "2017-03-01" "2017-04-01" "2017-05-01" "2017-06-01"
# "2017-07-01" "2017-08-01" "2017-09-01" "2017-10-01" "2017-11-01" "2017-12-01"
Hmisc::monthDays(as.Date(paste0(1, df$MONTH), "%d%b,%Y"))
#[1] 31 28 31 30 31 30 31 31 30 31 30 31
data
df <- structure(list(MONTH = structure(c(5L, 4L, 8L, 1L, 9L, 7L, 6L,
2L, 12L, 11L, 10L, 3L), .Label = c("Apr,2017", "Aug,2017", "Dec,2017",
"Feb,2017", "Jan,2017", "Jul,2017", "Jun,2017", "Mar,2017", "May,2017",
"Nov,2017", "Oct,2017", "Sep,2017"), class = "factor")), class =
"data.frame", row.names = c(NA, -12L))
You'll need to wrangle your MONTH column into a date object, then you can use lubridate::days_in_month():
# Install the packages only when they are missing. requireNamespace() checks
# availability without attaching anything; the library() calls below do the
# attaching and stop with an error if a package still cannot be loaded.
if (!requireNamespace("lubridate", quietly = TRUE)) install.packages("lubridate")
if (!requireNamespace("dplyr", quietly = TRUE)) install.packages("dplyr")
library(lubridate)
library(dplyr)

# Prefix each "Mon,YYYY" label with day 01 so that dmy() can parse it, then
# look up the number of days in the month of the resulting date.
dat %>%
  mutate(
    MONTH = dmy(paste("01", MONTH, sep = ",")),
    no_of_days = days_in_month(MONTH)
  )
# A tibble: 12 x 2
MONTH no_of_days
<date> <int>
1 2017-01-01 31
2 2017-02-01 28
3 2017-03-01 31
4 2017-04-01 30
5 2017-05-01 31
6 2017-06-01 30
7 2017-07-01 31
8 2017-08-01 31
9 2017-09-01 30
10 2017-10-01 31
11 2017-11-01 30
12 2017-12-01 31
The package lubridate has a function called days_in_month(). So you can first convert the MONTH column into dates and then use days_in_month:
library(lubridate)
library(zoo)
df <- structure(list(MONTH = structure(c(9L, 7L, 15L, 1L, 17L, 13L, 11L, 3L, 23L, 21L, 19L, 5L, 10L, 8L, 16L, 2L, 18L, 14L, 12L, 4L, 24L, 22L, 20L, 6L), .Label = c("Apr,2017", "Apr,2018", "Aug,2017", "Aug,2018", "Dec,2017", "Dec,2018", "Feb,2017", "Feb,2018", "Jan,2017", "Jan,2018", "Jul,2017", "Jul,2018", "Jun,2017", "Jun,2018", "Mar,2017", "Mar,2018", "May,2017", "May,2018", "Nov,2017", "Nov,2018", "Oct,2017", "Oct,2018", "Sep,2017", "Sep,2018"), class = "factor")), class = "data.frame", row.names = c(NA, -24L))
df$no_of_days <- days_in_month(as.Date(as.yearmon(df$MONTH, "%b,%Y")))
This is the function:
# Intended to drop every column of `mydata` whose name ends in ".y".
# NOTE(review): as written this removes nothing, for three reasons:
#   * `mydata$column` refers to a column literally named "column", not to
#     the name stored in the loop variable `column`;
#   * the assignment changes a copy of `mydata` that is local to the function;
#   * the function ends with the `for` loop, so the modified data is never
#     returned to the caller.
# In addition, the unescaped `.` in '.y$' matches any character, not only a
# literal dot.
remove_column <- function(column_vector) {
for (column in column_vector) {
if (grepl('.y$', column)) {
mydata$column <- NULL
}
}
}
What I think it's doing: I'm passing a vector of my column names to the function; it loops through the names and checks whether the last characters of each column name are ".y". If that is the case, the function eliminates the column.
I've tried putting prints here and there to see my vector and to see whether the conditional evaluates to TRUE or FALSE, and everything seems to be working fine, but for some reason, it doesn't get rid of the column.
The following function returns my column vector:
# Return the names in `col_names` that split into more than one piece on a
# literal period, e.g. "colname.x" -> c("colname", "x").
#
# col_names: vector of column names (typically `names(some_data_frame)`).
# Returns an unnamed character vector in the input order; `character(0)`
# when no name qualifies or the input is empty or NULL.
duplicate_names <- function(col_names) {
  col_names <- as.character(col_names)
  # fixed = TRUE splits on a literal ".", equivalent to the regex '\\.'
  n_parts <- lengths(strsplit(col_names, ".", fixed = TRUE))
  unname(col_names[n_parts > 1])
}
I usually call it like this:
duplicate_names(names(mydata))
This is what the vector of columns looks like:
c('v1.x', 'v2.y')
When I print the function it returns the following:
[1] "v1.x" "v2.y"
As requested by a user, the dput(droplevels(horsedata[1:5, 1:5])) (data that I am using for this):
dput(droplevels(horsedata[1:5, 1:5]))
structure(list(ÿþhorse_name = structure(c(3L, 1L, 2L, 4L, 5L), .Label = c("IM PRETTY FAMES",
"JESS ROYAL BUCKS", "KISS ME IM SUGAR", "LOLAMO", "RUN MADISON RUN"
), class = "factor"), owner_name = structure(c(3L, 2L, 1L, 5L,
4L), .Label = c("Christine Tavares", "Heste Sport, Inc.", "Picov Cattle Co.",
"Procter, Wayne and Carol", "Ruth F. Barbour"), class = "factor"),
program = structure(1:5, .Label = c("1", "2", "3", "4", "5"
), class = "factor"), pp = 1:5, todays_cls = c(61L, 61L,
61L, 61L, 61L)), .Names = c("ÿþhorse_name", "owner_name",
"program", "pp", "todays_cls"), row.names = c(NA, 5L), class = "data.frame")
We don't need a loop to subset the columns.
mydata[!grepl('\\.y$', column_list)]
If there are other columns not in the column_list and we want to keep them (assuming that the 'column_list' is ordered)
# Positions found in `column_list` are used as positions in `mydata`, so the
# two must be aligned. seq_len() keeps this valid for a zero-column data frame.
mydata[setdiff(seq_len(ncol(mydata)), grep('\\.y$', column_list))]
We can modify your function by
changing .y$ to \\.y$ as . means any character and not just the dot
Instead of $, we use [ to subset the dataset
Return the dataset after the assignment
# Drop from `dat` every column named in `column_vec` whose name ends in a
# literal ".y", and return the resulting data frame.
#
# dat:        data frame to remove columns from.
# column_vec: candidate column names; entries not ending in ".y" are ignored.
remove_column <- function(dat, column_vec) {
  # Select the ".y" names up front, then remove them one at a time.
  y_suffixed <- column_vec[grepl('\\.y$', column_vec, perl = TRUE)]
  for (nm in y_suffixed) {
    dat[nm] <- NULL
  }
  dat
}
remove_column(mydata, column_list)
# v1.x v2.x v3
#1 6 1 9
#2 4 11 7
#3 14 15 5
#4 10 2 4
#5 13 4 0
#6 19 14 1
#7 5 1 8
#8 16 12 7
#9 16 13 5
#10 5 0 7
data
mydata <- structure(list(v1.x = c(6L, 4L, 14L, 10L, 13L, 19L, 5L, 16L,
16L, 5L), v1.y = c(12L, 7L, 14L, 14L, 6L, 18L, 4L, 0L, 10L, 2L
), v2.x = c(1L, 11L, 15L, 2L, 4L, 14L, 1L, 12L, 13L, 0L), v2.y = c(6L,
5L, 7L, 3L, 19L, 4L, 15L, 13L, 14L, 20L), v3 = c(9L, 7L, 5L,
4L, 0L, 1L, 8L, 7L, 5L, 7L)), .Names = c("v1.x", "v1.y", "v2.x",
"v2.y", "v3"), row.names = c(NA, -10L), class = "data.frame")
column_list <- c('v1.x', 'v1.y', 'v2.x', 'v2.y')
I have a data frame whose columns are of several different classes. I would like to sum the columns that share the same name and are numeric, and replace the old columns with the new sum. Does anyone know a way to do this?
i.e I have a data frame like:
col1 col2 col3 col3
char factor int int
I would like to produce
col1 col2 col3
char factor 2int
I have previously used:
data <- as.data.frame(do.call(cbind, by(t(data),INDICES=names(data),FUN=colSums)))
However this was on a dataframe that only had numeric variables.
There are other examples on the internet but not meeting the conditions of: replacement, preserving the rest of the frame, and of being on a frame with multiple classes
Similar question: how do I search for columns with same name, add the column values and replace these columns with same name by their sum? Using R
Try
dat1 <- dat  # keep a copy of the original dataset
# Flag the numeric columns; vapply() always returns a logical vector.
indx <- vapply(dat, is.numeric, logical(1))
nm1 <- which(indx)  # positions of the numeric columns, named by column name
indx2 <- duplicated(names(nm1))  # TRUE for repeat occurrences of a name
if (length(nm1) > 0L) {
  # Group the positions by column name in order of first appearance, so the
  # groups line up with the target columns `nm1[!indx2]` (a plain
  # split(nm1, names(nm1)) would order the groups alphabetically instead).
  by_name <- split(nm1, factor(names(nm1), levels = unique(names(nm1))))
  # Store the row-wise sum of each group in the first column of that name.
  dat[nm1[!indx2]] <- unname(lapply(by_name, function(pos) rowSums(dat[pos])))
}
# Drop the repeat occurrences. setdiff() is used rather than a negative index
# because `dat[-integer(0)]` would select no columns at all.
dat[setdiff(seq_along(dat), nm1[indx2])]
Update
Or to make it more efficient, only take the "duplicated" and "numeric" columns while leaving the others intact. Create an "index" (indx2) of columns that are duplicated. Subset the "nm1" based on the "indx2" and then do rowSums as described above. Finally, remove the unwanted columns (duplicated ones) by using the "indx3"
# Run this on the original `dat` (e.g. restored from `dat1`): if the sums have
# already been written into `dat`, they would be added a second time.
# Keep only the numeric columns whose name occurs more than once.
indx2 <- duplicated(names(nm1)) | duplicated(names(nm1), fromLast = TRUE)
nm2 <- nm1[indx2]
indx3 <- duplicated(names(nm2))  # TRUE for repeat occurrences within `nm2`
if (length(nm2) > 0L) {
  # Group the positions by name in order of first appearance so the groups
  # line up with the target columns `nm2[!indx3]`.
  by_name <- split(nm2, factor(names(nm2), levels = unique(names(nm2))))
  dat[nm2[!indx3]] <- unname(lapply(by_name, function(pos) rowSums(dat[pos])))
}
# Remove the repeat occurrences; setdiff() keeps every column when there is
# nothing to drop (a negative empty index would select no columns).
datN <- dat[setdiff(seq_along(dat), nm2[indx3])]
datN
# col1 col2 col3 col4 col5
#1 16 23 2 10 10
#2 10 18 12 8 18
#3 21 23 15 6 10
#4 14 37 3 5 15
#5 29 39 5 1 11
#6 26 31 14 2 20
#7 25 31 2 8 10
#8 36 31 12 8 6
#9 32 26 13 6 4
#10 16 38 1 7 3
Checking the results
rowSums(dat1[names(dat1) %in% 'col1'])
#[1] 16 10 21 14 29 26 25 36 32 16
rowSums(dat1[names(dat1) %in% 'col2'])
#[1] 23 18 23 37 39 31 31 31 26 38
data
dat <- structure(list(col1 = c(6L, 5L, 15L, 11L, 14L, 19L, 6L, 16L,
17L, 6L), col2 = c(13L, 8L, 14L, 14L, 7L, 19L, 4L, 1L, 11L, 3L
), col3 = structure(c(2L, 5L, 8L, 3L, 4L, 7L, 2L, 5L, 6L, 1L), .Label = c("1",
"2", "3", "5", "12", "13", "14", "15"), class = "factor"), col2 = c(7L,
5L, 8L, 3L, 19L, 5L, 15L, 13L, 14L, 20L), col4 = structure(c(7L,
6L, 4L, 3L, 1L, 2L, 6L, 6L, 4L, 5L), .Label = c("1", "2", "5",
"6", "7", "8", "10"), class = "factor"), col5 = c(10L, 18L, 10L,
15L, 11L, 20L, 10L, 6L, 4L, 3L), col1 = c(10L, 5L, 6L, 3L, 15L,
7L, 19L, 20L, 15L, 10L), col2 = c(3L, 5L, 1L, 20L, 13L, 7L, 12L,
17L, 1L, 15L)), .Names = c("col1", "col2", "col3", "col2", "col4",
"col5", "col1", "col2"), row.names = c(NA, -10L), class = "data.frame")