merge based on an id with missing values and string - r

my df is shown below
mydf<- structure(list(IDs = c(11L, 16L, 19L, 21L, 22L, 24L, 42L, 43L,
51L), string1 = structure(c(1L, 8L, 7L, 2L, 4L, 9L, 6L, 3L, 5L
), .Label = c("b", "g", "hue", "hyu", "if", "jud", "ufhy", "uhgf;ffugf",
"uhgs"), class = "factor"), IDs.1 = c(4L, 11L, 16L, 19L, 20L,
22L, 29L, NA, NA), string2 = structure(c(2L, 3L, 8L, 7L, 4L,
5L, 6L, 1L, 1L), .Label = c("", "a", "b", "higf;hdugd", "hyu",
"inja", "ufhy", "uhgf;ffugf"), class = "factor")), .Names = c("IDs",
"string1", "IDs.1", "string2"), class = "data.frame", row.names = c(NA,
-9L))
I want to get them together like below
myout<- structure(list(Ids = c(4L, 11L, 16L, 19L, 20L, 21L, 22L, 24L,
29L, 42L, 43L, 51L), string = structure(c(1L, 2L, 11L, 10L, 4L,
3L, 6L, 12L, 8L, 9L, 5L, 7L), .Label = c("a", "b", "g", "higf;hdugd",
"hue", "hyu", "if", "inja", "jud", "ufhy", "uhgf;ffugf", "uhgs"
), class = "factor")), .Names = c("Ids", "string"), class = "data.frame", row.names = c(NA,
-12L))
I tried to do it using merge
df1 <- mydf[,1:2]
df2 <- mydf[,3:4]
df3 = merge(df1, df2, by.x=c("IDs", "string"))
which gives me an error because they are unequal
I also tried to use the approach given here
How to join (merge) data frames (inner, outer, left, right)? which did not solve my problem
my input is like this
IDs string1 IDs string2
11 b 4 a
16 uhgf;ffugf 11 b
19 ufhy 16 uhgf;ffugf
21 g 19 ufhy
22 hyu 20 higf;hdugd
24 uhgs 22 hyu
42 jud 29 inja
43 hue
51 if
and the output looks like this
Ids string
4 a
11 b
16 uhgf;ffugf
19 ufhy
20 higf;hdugd
21 g
22 hyu
24 uhgs
29 inja
42 jud
43 hue
51 if
e.g. 11, 16 etc are repeated twice , so we only want them once

We can do an rbind and remove the duplicated elements
library(data.table)
setnames(rbindlist(list(mydf[3:4], mydf[1:2]))[!is.na(IDs.1)&!duplicated(IDs.1)],
c("Ids", "string"))[order(Ids)]
# Ids string
# 1: 4 a
# 2: 11 b
# 3: 16 uhgf;ffugf
# 4: 19 ufhy
# 5: 20 higf;hdugd
# 6: 21 g
# 7: 22 hyu
# 8: 24 uhgs
# 9: 29 inja
#10: 42 jud
#11: 43 hue
#12: 51 if
Or another option is melt from data.table (to convert to 'long' format) which can take multiple measure patterns, then remove the duplicated 'Ids' and order using 'Ids'.
melt(setDT(mydf), measure = patterns("ID", "string"), na.rm=TRUE,
value.name = c("Ids", "string"))[!duplicated(Ids, fromLast=TRUE)
][, variable := NULL][order(Ids)]

Related

Explicit factor NAs in a data frame

I have the following data frame with ages binned in ranges of 5 years and the frequency of a condition happening in males/females. The problem is that there were no occurrences in either gender for example in the range 15-20.
structure(list(age = structure(c(1L, 2L, 3L, 5L, 6L, 7L, 8L,
9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L, 20L), .Label = c("[0,5]",
"(5,10]", "(10,15]", "(15,20]", "(20,25]", "(25,30]", "(30,35]",
"(35,40]", "(40,45]", "(45,50]", "(50,55]", "(55,60]", "(60,65]",
"(65,70]", "(70,75]", "(75,80]", "(80,85]", "(85,90]", "(90,95]",
"(95,100]"), class = "factor"), male = c(2L, 1L, 1L, 4L, 8L,
9L, 20L, 33L, 49L, 104L, 112L, 176L, 159L, 140L, 94L, 72L, 32L,
6L, 2L), female = c(1L, 1L, NA, 7L, 7L, 4L, 23L, 39L, 44L, 74L,
94L, 111L, 124L, 129L, 110L, 92L, 76L, 30L, 7L)), row.names = c(NA,
-19L), groups = structure(list(age = structure(c(1L, 2L, 3L,
5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L,
19L, 20L), .Label = c("[0,5]", "(5,10]", "(10,15]", "(15,20]",
"(20,25]", "(25,30]", "(30,35]", "(35,40]", "(40,45]", "(45,50]",
"(50,55]", "(55,60]", "(60,65]", "(65,70]", "(70,75]", "(75,80]",
"(80,85]", "(85,90]", "(90,95]", "(95,100]"), class = "factor"),
.rows = structure(list(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L,
10L, 11L, 12L, 13L, 14L, 15L, 16L, 17L, 18L, 19L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), row.names = c(NA, -19L), class = c("tbl_df",
"tbl", "data.frame"), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
If I check the levels it properly shows all levels.
What I want is a data frame in which every age range shows up, with 0 substituted where a range has no occurrences.
You can use complete :
library(dplyr)
library(tidyr)
df %>%
ungroup %>%
complete(age, fill = list(male = 0, female = 0))
# age male female
# <fct> <dbl> <dbl>
# 1 [0,5] 2 1
# 2 (5,10] 1 1
# 3 (10,15] 1 0
# 4 (15,20] 0 0
# 5 (20,25] 4 7
# 6 (25,30] 8 7
# 7 (30,35] 9 4
# 8 (35,40] 20 23
# 9 (40,45] 33 39
#10 (45,50] 49 44
#11 (50,55] 104 74
#12 (55,60] 112 94
#13 (60,65] 176 111
#14 (65,70] 159 124
#15 (70,75] 140 129
#16 (75,80] 94 110
#17 (80,85] 72 92
#18 (85,90] 32 76
#19 (90,95] 6 30
#20 (95,100] 2 7

Replace multiple characters from multiple columns in R

Given a dataframe as follows:
structure(list(date = structure(1:24, .Label = c("2010Y1-01m",
"2010Y1-02m", "2010Y1-03m", "2010Y1-04m", "2010Y1-05m", "2010Y1-06m",
"2010Y1-07m", "2010Y1-08m", "2010Y1-09m", "2010Y1-10m", "2010Y1-11m",
"2010Y1-12m", "2011Y1-01m", "2011Y1-02m", "2011Y1-03m", "2011Y1-04m",
"2011Y1-05m", "2011Y1-06m", "2011Y1-07m", "2011Y1-08m", "2011Y1-09m",
"2011Y1-10m", "2011Y1-11m", "2011Y1-12m"), class = "factor"),
a = structure(c(1L, 18L, 19L, 20L, 22L, 23L, 2L, 4L, 5L,
7L, 8L, 10L, 1L, 21L, 3L, 6L, 9L, 11L, 12L, 13L, 14L, 15L,
16L, 17L), .Label = c("--", "10159.28", "10295.69", "10580.82",
"10995.65", "11245.84", "11327.23", "11621.99", "12046.63",
"12139.78", "12848.27", "13398.26", "13962.6", "14559.72",
"14982.58", "15518.64", "15949.87", "7363.45", "8237.71",
"8830.99", "9309.47", "9316.56", "9795.77"), class = "factor"),
b = structure(c(2L, 16L, 23L, 24L, 4L, 6L, 7L, 9L, 10L, 12L,
14L, 17L, 1L, 22L, 3L, 5L, 8L, 11L, 13L, 15L, 18L, 19L, 20L,
21L), .Label = c("-", "--", "1058.18", "1455.6", "1539.01",
"1867.07", "2036.92", "2102.23", "2372.84", "2693.96", "2769.65",
"2973.04", "3146.88", "3227.23", "3604.71", "365.07", "3678.01",
"4043.18", "4438.55", "4860.76", "5360.94", "555.51", "653.19",
"980.72"), class = "factor"), c = structure(c(2L, 6L, 10L,
11L, 13L, 15L, 16L, 18L, 20L, 22L, 24L, 7L, 1L, 9L, 12L,
14L, 17L, 19L, 21L, 23L, 3L, 4L, 5L, 8L), .Label = c("-",
"--", "1092.73", "1222.48", "1409.07", "158.18", "1748.44",
"2179.42", "227.68", "268.53", "331.81", "366.95", "434.19",
"486.41", "538.49", "606.62", "614.75", "651.46", "729.44",
"736.55", "836.46", "890.81", "929.72", "981.65"), class = "factor")), class = "data.frame", row.names = c(NA,
-24L))
How could I replace -- and - in only columns a and b with NA? Thanks.
You can use :
cols <- c('a', 'b')
df[cols][df[cols] == '--' | df[cols] == '-'] <- NA
Or using dplyr :
library(dplyr)
df %>% mutate(across(c(a, b), ~replace(., . %in% c('--', '-'), NA)))
I think it's better to try to avoid the data being read in like this in the first place, but if you need to correct it after, you can try using the na.strings argument in type.convert. Notice that it's na.strings with an "s" -- it's plural, so more than one value can be used to represent NA values.
df[c("a", "b")] <- lapply(df[c("a", "b")], type.convert, na.strings = c("--", "-"))
str(df)
# 'data.frame': 24 obs. of 4 variables:
# $ date: Factor w/ 24 levels "2010Y1-01m","2010Y1-02m",..: 1 2 3 4 5 6 7 8 9 10 ...
# $ a : num NA 7363 8238 8831 9317 ...
# $ b : num NA 365 653 981 1456 ...
# $ c : Factor w/ 24 levels "-","--","1092.73",..: 2 6 10 11 13 15 16 18 20 22 ...
head(df)
# date a b c
# 1 2010Y1-01m NA NA --
# 2 2010Y1-02m 7363.45 365.07 158.18
# 3 2010Y1-03m 8237.71 653.19 268.53
# 4 2010Y1-04m 8830.99 980.72 331.81
# 5 2010Y1-05m 9316.56 1455.60 434.19
# 6 2010Y1-06m 9795.77 1867.07 538.49
Note that in this particular case, you could also use the side effect of as.numeric(as.character(...)) converting anything that can't be coerced to numeric to NA, but keep in mind that you will get a warning for each column that you use this approach on.
lapply(df[c("a", "b")], function(x) as.numeric(as.character(x)))

Get the number of days from the date column [duplicate]

This question already has answers here:
Find out the number of days of a month in R
(15 answers)
Closed 3 years ago.
I have a column, MONTH, containing several months, and I need to find the number of days in each of those months.
I need to find the number of days in a month considering the leap year and that the month can go to June 2019, or current month. The logic should be applied on the column.
MONTH no_of_days
Jan,2017
Feb,2017
Mar,2017
Apr,2017
May,2017
Jun,2017
Jul,2017
Aug,2017
Sep,2017
Oct,2017
Nov,2017
Dec,2017
structure(list(MONTH = structure(c(9L, 7L, 15L, 1L, 17L, 13L,
11L, 3L, 23L, 21L, 19L, 5L, 10L, 8L, 16L, 2L, 18L, 14L, 12L,
4L, 24L, 22L, 20L, 6L), .Label = c("Apr,2017", "Apr,2018", "Aug,2017",
"Aug,2018", "Dec,2017", "Dec,2018", "Feb,2017", "Feb,2018", "Jan,2017",
"Jan,2018", "Jul,2017", "Jul,2018", "Jun,2017", "Jun,2018", "Mar,2017",
"Mar,2018", "May,2017", "May,2018", "Nov,2017", "Nov,2018", "Oct,2017",
"Oct,2018", "Sep,2017", "Sep,2018"), class = "factor")), class = "data.frame", row.names = c(NA,
-24L))
You need to first convert the MONTH column to Date and then use any of the functions shown here to get the number of days in a month.
as.Date(paste0(1, df$MONTH), "%d%b,%Y")
#[1] "2017-01-01" "2017-02-01" "2017-03-01" "2017-04-01" "2017-05-01" "2017-06-01"
# "2017-07-01" "2017-08-01" "2017-09-01" "2017-10-01" "2017-11-01" "2017-12-01"
Hmisc::monthDays(as.Date(paste0(1, df$MONTH), "%d%b,%Y"))
#[1] 31 28 31 30 31 30 31 31 30 31 30 31
data
df <- structure(list(MONTH = structure(c(5L, 4L, 8L, 1L, 9L, 7L, 6L,
2L, 12L, 11L, 10L, 3L), .Label = c("Apr,2017", "Aug,2017", "Dec,2017",
"Feb,2017", "Jan,2017", "Jul,2017", "Jun,2017", "Mar,2017", "May,2017",
"Nov,2017", "Oct,2017", "Sep,2017"), class = "factor")), class =
"data.frame", row.names = c(NA, -12L))
You'll need to wrangle your MONTH column into a date object, then you can use lubridate::days_in_month():
# Install lubridate/dplyr only when they are missing. requireNamespace() just
# checks availability; require() would also attach the package and merely
# return FALSE on failure, so attaching is left to the library() calls below.
if (!requireNamespace("lubridate", quietly = TRUE)) {
  install.packages("lubridate")
}
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(lubridate)
library(dplyr)
# Prefix each "Mon,YYYY" label with day 01 so dmy() can parse it as a Date,
# then look up the number of days in that month.
dat %>%
  mutate(MONTH = paste("01", MONTH, sep = ",") %>% dmy(),
         no_of_days = days_in_month(MONTH))
# A tibble: 12 x 2
MONTH no_of_days
<date> <int>
1 2017-01-01 31
2 2017-02-01 28
3 2017-03-01 31
4 2017-04-01 30
5 2017-05-01 31
6 2017-06-01 30
7 2017-07-01 31
8 2017-08-01 31
9 2017-09-01 30
10 2017-10-01 31
11 2017-11-01 30
12 2017-12-01 31
The package lubridate has a function called days_in_month(). So you can first convert the MONTH column into dates and then use days_in_month:
library(lubridate)
library(zoo)
df <- structure(list(MONTH = structure(c(9L, 7L, 15L, 1L, 17L, 13L, 11L, 3L, 23L, 21L, 19L, 5L, 10L, 8L, 16L, 2L, 18L, 14L, 12L, 4L, 24L, 22L, 20L, 6L), .Label = c("Apr,2017", "Apr,2018", "Aug,2017", "Aug,2018", "Dec,2017", "Dec,2018", "Feb,2017", "Feb,2018", "Jan,2017", "Jan,2018", "Jul,2017", "Jul,2018", "Jun,2017", "Jun,2018", "Mar,2017", "Mar,2018", "May,2017", "May,2018", "Nov,2017", "Nov,2018", "Oct,2017", "Oct,2018", "Sep,2017", "Sep,2018"), class = "factor")), class = "data.frame", row.names = c(NA, -24L))
df$no_of_days <- days_in_month(as.Date(as.yearmon(df$MONTH, "%b,%Y")))

'mydata$column <- NULL' not working in function

This is the function:
# Intended to drop every column of `mydata` whose name ends in ".y".
# `column_vector` is the vector of column names to inspect.
# NOTE(review): as written this does not change `mydata` for the caller:
# `$` takes the name after it literally, so `mydata$column` refers to a column
# called "column" rather than to the name held in the loop variable, and the
# assignment is made inside the function, which never returns the data.
remove_column <- function(column_vector) {
for (column in column_vector) {
# The unescaped "." in the pattern matches any character, not only a dot.
if (grepl('.y$', column)) {
mydata$column <- NULL
}
}
}
What I think it's doing: I'm passing a vector of my column names to the function; it loops through the names and checks whether the last characters of each column name are ".y". If that is the case, the function eliminates the column.
I've tried putting prints here and there to see my vector and to see whether the conditional evaluates to TRUE or FALSE, and everything seems to be working fine, but for some reason, it doesn't get rid of the column.
The following function returns my column vector:
#' Find column names that carry a dot-separated suffix
#'
#' Splits each name on "." and keeps the names that yield more than one
#' piece (e.g. "v1.x" splits into "v1" and "x").
#'
#' @param col_names Character vector of column names, e.g. `names(mydata)`.
#' @return Character vector of the matching names in their original order;
#'   `character(0)` when none match.
duplicate_names <- function(col_names) {
  col_names <- as.character(col_names)
  # strsplit() is vectorised, so no loop or incremental c() growth is needed
  n_parts <- lengths(strsplit(col_names, "\\."))
  col_names[n_parts > 1]
}
I usually call it like this:
duplicate_names(names(mydata))
This is what the vector of columns looks like:
c('v1.x', 'v2.y')
When I print the function it returns the following:
[1] "v1.x" "v2.y"
As requested by a user, the dput(droplevels(horsedata[1:5, 1:5])) (data that I am using for this):
dput(droplevels(horsedata[1:5, 1:5]))
structure(list(ÿþhorse_name = structure(c(3L, 1L, 2L, 4L, 5L), .Label = c("IM PRETTY FAMES",
"JESS ROYAL BUCKS", "KISS ME IM SUGAR", "LOLAMO", "RUN MADISON RUN"
), class = "factor"), owner_name = structure(c(3L, 2L, 1L, 5L,
4L), .Label = c("Christine Tavares", "Heste Sport, Inc.", "Picov Cattle Co.",
"Procter, Wayne and Carol", "Ruth F. Barbour"), class = "factor"),
program = structure(1:5, .Label = c("1", "2", "3", "4", "5"
), class = "factor"), pp = 1:5, todays_cls = c(61L, 61L,
61L, 61L, 61L)), .Names = c("ÿþhorse_name", "owner_name",
"program", "pp", "todays_cls"), row.names = c(NA, 5L), class = "data.frame")
We don't need a loop to subset the columns.
mydata[!grepl('\\.y$', column_list)]
If there are other columns not in the column_list and we want to keep them (assuming that the 'column_list' is ordered)
# seq_len() is safe for a zero-column data frame, where 1:ncol() gives c(1, 0)
mydata[setdiff(seq_len(ncol(mydata)), grep('\\.y$', column_list))]
We can modify your function by
changing .y$ to \\.y$ as . means any character and not just the dot
Instead of $, we use [ to subset the dataset
Return the dataset after the assignment
# Drop from `dat` every column named in `column_vec` whose name ends in ".y",
# and return the reduced data set. Names that do not end in ".y" are ignored.
remove_column <- function(dat, column_vec) {
  # Decide once, for the whole vector, which names carry the ".y" suffix
  ends_in_y <- grepl('\\.y$', column_vec, perl=TRUE)
  for (target in column_vec[ends_in_y]) {
    # `[` with a character index uses the value of `target` as the column name
    dat[target] <- NULL
  }
  dat
}
remove_column(mydata, column_list)
# v1.x v2.x v3
#1 6 1 9
#2 4 11 7
#3 14 15 5
#4 10 2 4
#5 13 4 0
#6 19 14 1
#7 5 1 8
#8 16 12 7
#9 16 13 5
#10 5 0 7
data
mydata <- structure(list(v1.x = c(6L, 4L, 14L, 10L, 13L, 19L, 5L, 16L,
16L, 5L), v1.y = c(12L, 7L, 14L, 14L, 6L, 18L, 4L, 0L, 10L, 2L
), v2.x = c(1L, 11L, 15L, 2L, 4L, 14L, 1L, 12L, 13L, 0L), v2.y = c(6L,
5L, 7L, 3L, 19L, 4L, 15L, 13L, 14L, 20L), v3 = c(9L, 7L, 5L,
4L, 0L, 1L, 8L, 7L, 5L, 7L)), .Names = c("v1.x", "v1.y", "v2.x",
"v2.y", "v3"), row.names = c(NA, -10L), class = "data.frame")
column_list <- c('v1.x', 'v1.y', 'v2.x', 'v2.y')

Sum and replace columns with same name R for a data frame containing different classes

I have a data frame containing multiple classes, I would like to sum those columns that have the same name and are numeric, and replace the old columns with the new sum, does anyone know a way to do this?
i.e I have a data frame like:
col1 col2 col3 col3
char factor int int
I would like to produce
col1 col2 col3
char factor 2int
I have previously used:
data <- as.data.frame(do.call(cbind, by(t(data),INDICES=names(data),FUN=colSums)))
However this was on a dataframe that only had numeric variables.
There are other examples on the internet but not meeting the conditions of: replacement, preserving the rest of the frame, and of being on a frame with multiple classes
Similar question: how do I search for columns with same name, add the column values and replace these columns with same name by their sum? Using R
Try
dat1 <- dat # keep a copy of the original dataset
# vapply() is type-stable: always a named logical vector, one entry per column
indx <- vapply(dat, is.numeric, logical(1))
nm1 <- which(indx) # positions of the numeric columns, named by column name
indx2 <- duplicated(names(nm1)) # TRUE for a repeat of an earlier numeric name
if (length(nm1) > 0) {
  # Group the positions by name in order of first appearance, so the i-th group
  # matches the i-th target column in nm1[!indx2]. split(nm1, names(nm1))
  # orders groups alphabetically, which can write sums into the wrong columns.
  grp <- split(nm1, factor(names(nm1), levels = unique(names(nm1))))
  # Row-wise sum of each group, stored in the first column carrying that name
  dat[nm1[!indx2]] <- Map(function(x, y) rowSums(x[y]), list(dat), grp)
}
# Drop the repeated columns by positive index: dat[-integer(0)] would select
# no columns at all when nothing is duplicated.
dat[setdiff(seq_along(dat), nm1[indx2])]
Update
Or to make it more efficient, only take the "duplicated" and "numeric" columns while leaving the others intact. Create an "index" (indx2) of columns that are duplicated. Subset the "nm1" based on the "indx2" and then do rowSums as described above. Finally, remove the unwanted columns (duplicated ones) by using the "indx3"
# Flag every numeric column whose name occurs more than once (all occurrences)
indx2 <- duplicated(names(nm1)) | duplicated(names(nm1), fromLast = TRUE)
nm2 <- nm1[indx2] # positions of just those columns, named by column name
indx3 <- duplicated(names(nm2)) # TRUE for the second and later occurrences
if (length(nm2) > 0) {
  # Group the positions by name in order of first appearance, so the i-th group
  # matches the i-th target column in nm2[!indx3]. split(nm2, names(nm2))
  # orders groups alphabetically, which can write sums into the wrong columns.
  grp <- split(nm2, factor(names(nm2), levels = unique(names(nm2))))
  dat[nm2[!indx3]] <- Map(function(x, y) rowSums(x[y]), list(dat), grp)
}
# Drop the repeated columns by positive index: dat[-integer(0)] would select
# no columns at all when nothing is duplicated.
datN <- dat[setdiff(seq_along(dat), nm2[indx3])]
datN
# col1 col2 col3 col4 col5
#1 16 23 2 10 10
#2 10 18 12 8 18
#3 21 23 15 6 10
#4 14 37 3 5 15
#5 29 39 5 1 11
#6 26 31 14 2 20
#7 25 31 2 8 10
#8 36 31 12 8 6
#9 32 26 13 6 4
#10 16 38 1 7 3
Checking the results
rowSums(dat1[names(dat1) %in% 'col1'])
#[1] 16 10 21 14 29 26 25 36 32 16
rowSums(dat1[names(dat1) %in% 'col2'])
#[1] 23 18 23 37 39 31 31 31 26 38
data
dat <- structure(list(col1 = c(6L, 5L, 15L, 11L, 14L, 19L, 6L, 16L,
17L, 6L), col2 = c(13L, 8L, 14L, 14L, 7L, 19L, 4L, 1L, 11L, 3L
), col3 = structure(c(2L, 5L, 8L, 3L, 4L, 7L, 2L, 5L, 6L, 1L), .Label = c("1",
"2", "3", "5", "12", "13", "14", "15"), class = "factor"), col2 = c(7L,
5L, 8L, 3L, 19L, 5L, 15L, 13L, 14L, 20L), col4 = structure(c(7L,
6L, 4L, 3L, 1L, 2L, 6L, 6L, 4L, 5L), .Label = c("1", "2", "5",
"6", "7", "8", "10"), class = "factor"), col5 = c(10L, 18L, 10L,
15L, 11L, 20L, 10L, 6L, 4L, 3L), col1 = c(10L, 5L, 6L, 3L, 15L,
7L, 19L, 20L, 15L, 10L), col2 = c(3L, 5L, 1L, 20L, 13L, 7L, 12L,
17L, 1L, 15L)), .Names = c("col1", "col2", "col3", "col2", "col4",
"col5", "col1", "col2"), row.names = c(NA, -10L), class = "data.frame")

Resources