Given a dataframe as follows:
structure(list(date = structure(1:24, .Label = c("2010Y1-01m",
"2010Y1-02m", "2010Y1-03m", "2010Y1-04m", "2010Y1-05m", "2010Y1-06m",
"2010Y1-07m", "2010Y1-08m", "2010Y1-09m", "2010Y1-10m", "2010Y1-11m",
"2010Y1-12m", "2011Y1-01m", "2011Y1-02m", "2011Y1-03m", "2011Y1-04m",
"2011Y1-05m", "2011Y1-06m", "2011Y1-07m", "2011Y1-08m", "2011Y1-09m",
"2011Y1-10m", "2011Y1-11m", "2011Y1-12m"), class = "factor"),
a = structure(c(1L, 18L, 19L, 20L, 22L, 23L, 2L, 4L, 5L,
7L, 8L, 10L, 1L, 21L, 3L, 6L, 9L, 11L, 12L, 13L, 14L, 15L,
16L, 17L), .Label = c("--", "10159.28", "10295.69", "10580.82",
"10995.65", "11245.84", "11327.23", "11621.99", "12046.63",
"12139.78", "12848.27", "13398.26", "13962.6", "14559.72",
"14982.58", "15518.64", "15949.87", "7363.45", "8237.71",
"8830.99", "9309.47", "9316.56", "9795.77"), class = "factor"),
b = structure(c(2L, 16L, 23L, 24L, 4L, 6L, 7L, 9L, 10L, 12L,
14L, 17L, 1L, 22L, 3L, 5L, 8L, 11L, 13L, 15L, 18L, 19L, 20L,
21L), .Label = c("-", "--", "1058.18", "1455.6", "1539.01",
"1867.07", "2036.92", "2102.23", "2372.84", "2693.96", "2769.65",
"2973.04", "3146.88", "3227.23", "3604.71", "365.07", "3678.01",
"4043.18", "4438.55", "4860.76", "5360.94", "555.51", "653.19",
"980.72"), class = "factor"), c = structure(c(2L, 6L, 10L,
11L, 13L, 15L, 16L, 18L, 20L, 22L, 24L, 7L, 1L, 9L, 12L,
14L, 17L, 19L, 21L, 23L, 3L, 4L, 5L, 8L), .Label = c("-",
"--", "1092.73", "1222.48", "1409.07", "158.18", "1748.44",
"2179.42", "227.68", "268.53", "331.81", "366.95", "434.19",
"486.41", "538.49", "606.62", "614.75", "651.46", "729.44",
"736.55", "836.46", "890.81", "929.72", "981.65"), class = "factor")), class = "data.frame", row.names = c(NA,
-24L))
How could I replace -- and - in only columns a and b with NA? Thanks.
You can use :
cols <- c('a', 'b')
df[cols][df[cols] == '--' | df[cols] == '-'] <- NA
Or using dplyr :
library(dplyr)
df %>% mutate(across(c(a, b), ~replace(., . %in% c('--', '-'), NA)))
I think it's better to try to avoid the data being read in like this in the first place, but if you need to correct it after, you can try using the na.strings argument in type.convert. Notice that it's na.strings with an "s" -- it's plural, so more than one value can be used to represent NA values.
df[c("a", "b")] <- lapply(df[c("a", "b")], type.convert, na.strings = c("--", "-"))
str(df)
# 'data.frame': 24 obs. of 4 variables:
# $ date: Factor w/ 24 levels "2010Y1-01m","2010Y1-02m",..: 1 2 3 4 5 6 7 8 9 10 ...
# $ a : num NA 7363 8238 8831 9317 ...
# $ b : num NA 365 653 981 1456 ...
# $ c : Factor w/ 24 levels "-","--","1092.73",..: 2 6 10 11 13 15 16 18 20 22 ...
head(df)
# date a b c
# 1 2010Y1-01m NA NA --
# 2 2010Y1-02m 7363.45 365.07 158.18
# 3 2010Y1-03m 8237.71 653.19 268.53
# 4 2010Y1-04m 8830.99 980.72 331.81
# 5 2010Y1-05m 9316.56 1455.60 434.19
# 6 2010Y1-06m 9795.77 1867.07 538.49
Note that in this particular case, you could also use the side effect of as.numeric(as.character(...)) converting anything that can't be coerced to numeric to NA, but keep in mind that you will get a warning for each column that you use this approach on.
lapply(df[c("a", "b")], function(x) as.numeric(as.character(x)))
Related
I am trying to write a code that finds the 3 consecutives months that are the coldest.
For now I have written a code for the 3 first months (1,2,3) but then it should be applied to (4,5,6), (7,8,9), (10,11,12), (2,3,4), (5,6,7), (8,9,10), (11,12,1), (3,4,5), (6,7,8), (9,10,11) and (12,1,2) which are all the possible combinations of 3 consecutives months.
The code I wrote is here :
cold <- data_example %>%
group_by(Site) %>%
filter(Month %in% c(1,2,3)) %>%
mutate(mean_temperature = mean(t_q)) %>%
dplyr::select(-c(t_q,Month)) %>%
distinct(Site, mean_temperature)
average_temp_month_1_2_3 <- cold$mean_temperature
Then I replaced the c(1,2,3) by all possiblities, I have created a new column for each output.
I end up with a dataset with row corresponding to Site and columns are all the possibilities of 3 consecutive months.
After I took the min value for each row using the function apply() and min() and it gives me the coldest quarter for each Site.
I am looking for a way to generalize it, like creating a loop on the possiblities.
The structure of data_example is as follow :
structure(list(Site = c(4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L, 4L,
4L, 4L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 5L, 13L,
13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 13L, 14L, 14L,
14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 14L, 15L, 15L, 15L,
15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 15L, 16L, 16L, 16L, 16L,
16L, 16L, 16L, 16L, 16L, 16L, 16L, 16L, 17L, 17L, 17L, 17L, 17L,
17L, 17L, 17L, 17L, 17L, 17L, 17L, 18L, 18L, 18L, 18L, 18L, 18L,
18L, 18L, 18L, 18L, 18L, 18L, 25L, 25L, 25L, 25L, 25L, 25L, 25L,
25L, 25L, 25L, 25L, 25L, 26L, 26L, 26L, 26L, 26L, 26L, 26L, 26L,
26L, 26L, 26L, 26L), Month = c(1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L,
12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L,
9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L,
12L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L,
3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 1L, 2L, 3L, 4L, 5L,
6L, 7L, 8L, 9L, 10L, 11L, 12L), t_q = c(9.67754848470332, -6.74555496540183,
5.67969761460384, 12.537207581471, -9.4899105618945, 21.0747672424502,
15.2643039243614, -3.62839910494421, 11.3919028351455, 1.69988257436554,
4.22015024307287, 11.7045830784212, 8.91437673833493, 0.579081429509138,
-10.8207481229903, 7.05356868592628, 13.0911580912516, 17.2032089167605,
-2.47642708849114, -11.2105599344486, 33.986736305027, 17.8578689773214,
-14.9114468266335, 14.4681380389141, 0.568074240873411, 7.65458408777801,
1.91368344556659, 6.01571556896127, 11.4858297513536, 2.2608458985328,
-2.08200762781776, 12.1540989284163, 20.9941815285413, 0.375777604316208,
-2.7137027317614, -6.17690210400591, 11.2549857164403, 17.447156776654,
-6.96565197389579, -5.41542361226991, 11.1680111873065, 16.2266522778922,
-11.4503938582433, 5.93300314835716, -18.2818398656237, 16.2930210946949,
9.80219192652316, -0.48237356523527, 7.72680942503686, 5.84113084181759,
9.66129413490096, -4.53018262186904, 7.42187509892118, 9.2559478576895,
8.25120948667013, 8.18182063263247, 16.3703081943971, 19.5469951420341,
3.71888263185773, -0.150179891749435, 1.32057298670562, -5.63556532224354,
21.3918542474341, 4.58752188336035, 5.49430262894033, 5.99587512047837,
-3.76459024109216, -8.53522098071824, 8.01805680562232, 26.2227490426066,
8.90822434139878, 5.04259034084471, 6.89740304247746, 11.9484584922927,
-11.5085102739471, 30.4526759119379, 21.878533782357, -5.39936677076962,
-9.83965056853816, 19.3083455159472, 7.90653548036154, 3.11876660277767,
-8.85027083180008, -9.9225496831988, 5.97307112581907, -2.83528336599284,
-2.75758002814396, 4.68388181004449, 6.61649031537118, -6.65988084338133,
-0.981075313384259, 5.84898952305179, -5.20962191660178, 0.416662319713158,
-10.5336993269853, 19.5350642296553, 26.9696625385792, 15.3291059661081,
15.0799591208354, 13.2310653499033, 7.2053382722482, -7.87288386491102,
20.8083797469715, 6.16664220270041, 8.3360949793043, -14.4000921795463,
-10.5503025782944, 14.3185205291177, 5.83802399796341, 2.49660818997943,
15.7399297014092, -0.834086173817971, 12.4883230222372, 6.73548467376379,
7.7988835803825, -5.13583355913738, 7.51054162811707, 11.6610602814336,
-11.8864185954223, 4.2704440943851)), row.names = c(NA, -120L
), groups = structure(list(Site = c(4L, 5L, 13L, 14L, 15L, 16L,
17L, 18L, 25L, 26L), .rows = structure(list(1:12, 13:24, 25:36,
37:48, 49:60, 61:72, 73:84, 85:96, 97:108, 109:120), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -10L), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))
You can use raster::movingFun to do a moving average with circular data, then use slice_min to get the minimum value per group.
library(dplyr)
circ <- function(x, by) ifelse(x%%by == 0, by, x%%by)
df %>%
group_by(Site) %>%
mutate(rolmean = raster::movingFun(t_q, n = 3, fun = mean, circular = TRUE)) %>%
slice_min(rolmean) %>%
mutate(coldest = toString(circ(c(Month-1, Month, Month+1), 12)))
output
# A tibble: 10 × 5
# Groups: Site [10]
Site Month t_q rolmean coldest
<int> <int> <dbl> <dbl> <chr>
1 4 2 -6.75 2.87 1, 2, 3
2 5 3 -10.8 -1.06 2, 3, 4
3 13 11 -2.71 -2.84 10, 11, 12
4 14 8 5.93 -7.93 7, 8, 9
5 15 3 9.66 3.66 2, 3, 4
6 16 7 -3.76 -2.10 6, 7, 8
7 17 11 -8.85 -5.22 10, 11, 12
8 18 10 0.417 -5.11 9, 10, 11
9 25 10 -14.4 -5.54 9, 10, 11
10 26 12 4.27 -0.593 11, 12, 1
Using which.min in aggregate on a moving average window.
aggregate(t_q ~ Site, dat, \(s) {
win <- 3 ## window length
sq <- Map(seq, 1:(length(s) - win + 1), win:length(s))
toString(sq[[which.min(sapply(sq, \(sq) mean(s[sq])))]])
})
# Site t_q
# 1 4 1, 2, 3
# 2 5 2, 3, 4
# 3 13 10, 11, 12
# 4 14 7, 8, 9
# 5 15 2, 3, 4
# 6 16 6, 7, 8
# 7 17 10, 11, 12
# 8 18 9, 10, 11
# 9 25 9, 10, 11
# 10 26 10, 11, 12
I'm trying to find the shortest possible dplyr-purr combination.
Can I reduce the following statement which combines select_if() and map_df() ?
training.set.imputed %>%
select_if(~sum(is.na(.))>0) %>% map_df(~sum(is.na(.)))
I tried this:
training.set.imputed %>%
select_if(~sum(is.na(.))>0, .funs = ~sum(is.na(.)))
which throws this error:
Error: nm must be NULL or a character vector the same length as x
What does this mean?
Any ideas how to form the .funs term?
The .funs argument in select_if requires a renaming function, and not a mutating function, so you can do something like this with it, but you can't mutate the variable values:
tibble(blah = 1:2, bleh = 3:4, bluh = c(NA, NA)) %>%
select_if(~ sum(is.na(.x)) > 0, .funs = toupper)
#### OUTPUT ####
# A tibble: 2 x 1
BLUH
<lgl>
1 NA
2 NA
If you insist on using a combination of purrr and dplyr, then this is probably your best bet (Edit: I just noticed that G. Grothendieck gave this answer, but I'll include it anyway for the sake of completeness.):
df %>%
map_df(~ sum(is.na(.))) %>%
select_if(~ . > 0)
#### OUTPUT ####
# A tibble: 1 x 2
b d
<int> <int>
1 4 1
However, you can simplify it a little by just using dplyr's summarize_if:
df %>%
summarise_if(anyNA, ~ sum(is.na(.)))
#### OUTPUT ####
# A tibble: 1 x 2
b d
<int> <int>
1 4 1
Since you're really just after column sums, base R might offer the most concise option:
colSums(is.na(df)) %>%
.[. > 0]
#### OUTPUT ####
b d
4 1
Data
structure(list(a = c(2L, 2L, 5L, 10L, 10L, 18L, 18L, 19L, 11L,
14L, 12L, 10L, 4L, 16L, 5L, 5L, 11L, 2L, 14L, 7L), b = c(10L,
20L, 16L, NA, 6L, 1L, 11L, 12L, 12L, 12L, 8L, NA, NA, 8L, 11L,
19L, 8L, 9L, NA, 19L), c = c(11L, 11L, 20L, 8L, 15L, 4L, 17L,
4L, 4L, 11L, 20L, 11L, 6L, 12L, 17L, 7L, 14L, 18L, 15L, 19L),
d = c(19L, 16L, 17L, 14L, 8L, 19L, 7L, 6L, 6L, 13L, 7L, 19L,
11L, 17L, NA, 10L, 3L, 3L, 3L, 2L), e = c(12L, 17L, 14L,
5L, 8L, 19L, 8L, 3L, 17L, 1L, 2L, 6L, 5L, 17L, 14L, 5L, 8L,
2L, 8L, 2L)), row.names = c(NA, -20L), class = c("tbl_df",
"tbl", "data.frame"))
I assume you want the number of NAs in each column keeping only columns that have at least 1 NA.
!) This avoids the code duplication and does not give an error. First calculate the number of NAs in each column and then pick out the columns that are greater than 0.
# test input - BOD comes with R
BOD[1,2] <- NA
BOD %>%
map_df(~ sum(is.na(.))) %>%
select_if(~ . > 0)
giving:
# A tibble: 1 x 1
demand
<int>
1 1
2) This first selects out those columns with at least one NA and then finds the number of NAs in those columns giving the same result:
BOD %>%
select_if(anyNA) %>%
map_df(~ sum(is.na(.)))
This question already has answers here:
Find out the number of days of a month in R
(15 answers)
Closed 3 years ago.
I have column - Month, and there are few months, I need to find out the number of day in that month.
I need to find the number of days in a month considering the leap year and that the month can go to June 2019, or current month. The logic should be applied on the column.
MONTH no_of_days
Jan,2017
Feb,2017
Mar,2017
Apr,2017
May,2017
Jun,2017
Jul,2017
Aug,2017
Sep,2017
Oct,2017
Nov,2017
Dec,2017
structure(list(MONTH = structure(c(9L, 7L, 15L, 1L, 17L, 13L,
11L, 3L, 23L, 21L, 19L, 5L, 10L, 8L, 16L, 2L, 18L, 14L, 12L,
4L, 24L, 22L, 20L, 6L), .Label = c("Apr,2017", "Apr,2018", "Aug,2017",
"Aug,2018", "Dec,2017", "Dec,2018", "Feb,2017", "Feb,2018", "Jan,2017",
"Jan,2018", "Jul,2017", "Jul,2018", "Jun,2017", "Jun,2018", "Mar,2017",
"Mar,2018", "May,2017", "May,2018", "Nov,2017", "Nov,2018", "Oct,2017",
"Oct,2018", "Sep,2017", "Sep,2018"), class = "factor")), class = "data.frame", row.names = c(NA,
-24L))
You need to first get the column MONTH to Date and then use any of the function shown here to get number of days in a month.
as.Date(paste0(1, df$MONTH), "%d%b,%Y")
#[1] "2017-01-01" "2017-02-01" "2017-03-01" "2017-04-01" "2017-05-01" "2017-06-01"
# "2017-07-01" "2017-08-01" "2017-09-01" "2017-10-01" "2017-11-01" "2017-12-01"
Hmisc::monthDays(as.Date(paste0(1, df$MONTH), "%d%b,%Y"))
#[1] 31 28 31 30 31 30 31 31 30 31 30 31
data
df <- structure(list(MONTH = structure(c(5L, 4L, 8L, 1L, 9L, 7L, 6L,
2L, 12L, 11L, 10L, 3L), .Label = c("Apr,2017", "Aug,2017", "Dec,2017",
"Feb,2017", "Jan,2017", "Jul,2017", "Jun,2017", "Mar,2017", "May,2017",
"Nov,2017", "Oct,2017", "Sep,2017"), class = "factor")), class =
"data.frame", row.names = c(NA, -12L))
You'll need to wrangle your MONTH column into a date object, then you can use lubridate::days_in_month():
if (!require(lubridate)) install.packages('lubridate')
if (!require(dplyr)) install.packages('dplyr')
library(lubridate)
library(dplyr)
dat %>%
mutate(MONTH = paste("01", MONTH, sep = ",") %>% dmy(),
no_of_days = days_in_month(MONTH))
# A tibble: 12 x 2
MONTH no_of_days
<date> <int>
1 2017-01-01 31
2 2017-02-01 28
3 2017-03-01 31
4 2017-04-01 30
5 2017-05-01 31
6 2017-06-01 30
7 2017-07-01 31
8 2017-08-01 31
9 2017-09-01 30
10 2017-10-01 31
11 2017-11-01 30
12 2017-12-01 31
The package lubridate has a function called days_in_month(). So you can first convert the MONTH column into dates and then use days_in_month:
library(lubridate)
library(zoo)
df <- structure(list(MONTH = structure(c(9L, 7L, 15L, 1L, 17L, 13L, 11L, 3L, 23L, 21L, 19L, 5L, 10L, 8L, 16L, 2L, 18L, 14L, 12L, 4L, 24L, 22L, 20L, 6L), .Label = c("Apr,2017", "Apr,2018", "Aug,2017", "Aug,2018", "Dec,2017", "Dec,2018", "Feb,2017", "Feb,2018", "Jan,2017", "Jan,2018", "Jul,2017", "Jul,2018", "Jun,2017", "Jun,2018", "Mar,2017", "Mar,2018", "May,2017", "May,2018", "Nov,2017", "Nov,2018", "Oct,2017", "Oct,2018", "Sep,2017", "Sep,2018"), class = "factor")), class = "data.frame", row.names = c(NA, -24L))
df$no_of_days <- days_in_month(as.Date(as.yearmon(df$MONTH, "%b,%Y")))
my df is shown below
mydf<- structure(list(IDs = c(11L, 16L, 19L, 21L, 22L, 24L, 42L, 43L,
51L), string1 = structure(c(1L, 8L, 7L, 2L, 4L, 9L, 6L, 3L, 5L
), .Label = c("b", "g", "hue", "hyu", "if", "jud", "ufhy", "uhgf;ffugf",
"uhgs"), class = "factor"), IDs.1 = c(4L, 11L, 16L, 19L, 20L,
22L, 29L, NA, NA), string2 = structure(c(2L, 3L, 8L, 7L, 4L,
5L, 6L, 1L, 1L), .Label = c("", "a", "b", "higf;hdugd", "hyu",
"inja", "ufhy", "uhgf;ffugf"), class = "factor")), .Names = c("IDs",
"string1", "IDs.1", "string2"), class = "data.frame", row.names = c(NA,
-9L))
I want to get them together like below
myout<- structure(list(Ids = c(4L, 11L, 16L, 19L, 20L, 21L, 22L, 24L,
29L, 42L, 43L, 51L), string = structure(c(1L, 2L, 11L, 10L, 4L,
3L, 6L, 12L, 8L, 9L, 5L, 7L), .Label = c("a", "b", "g", "higf;hdugd",
"hue", "hyu", "if", "inja", "jud", "ufhy", "uhgf;ffugf", "uhgs"
), class = "factor")), .Names = c("Ids", "string"), class = "data.frame", row.names = c(NA,
-12L))
I tried to do it using merge
df1 <- mydf[,1:2]
df2 <- mydf[,3:4]
df3 = merge(df1, df2, by.x=c("IDs", "string"))
which gives me an error because they are unequal
I also tried to use the approach given here
How to join (merge) data frames (inner, outer, left, right)? which did not solve my problem
my input is like this
IDs string1 IDs string2
11 b 4 a
16 uhgf;ffugf 11 b
19 ufhy 16 uhgf;ffugf
21 g 19 ufhy
22 hyu 20 higf;hdugd
24 uhgs 22 hyu
42 jud 29 inja
43 hue
51 if
and the output looks like this
Ids string
4 a
11 b
16 uhgf;ffugf
19 ufhy
20 higf;hdugd
21 g
22 hyu
24 uhgs
29 inja
42 jud
43 hue
51 if
e.g. 11, 16 etc are repeated twice , so we only want them once
We can do an rbind and remove the duplicated elements
library(data.table)
setnames(rbindlist(list(mydf[3:4], mydf[1:2]))[!is.na(IDs.1)&!duplicated(IDs.1)],
c("Ids", "string"))[order(Ids)]
# Ids string
# 1: 4 a
# 2: 11 b
# 3: 16 uhgf;ffugf
# 4: 19 ufhy
# 5: 20 higf;hdugd
# 6: 21 g
# 7: 22 hyu
# 8: 24 uhgs
# 9: 29 inja
#10: 42 jud
#11: 43 hue
#12: 51 if
Or another option is melt from data.table (to convert to 'long' format) which can take multiple measure patterns, then remove the duplicated 'Ids' and order using 'Ids'.
melt(setDT(mydf), measure = patterns("ID", "string"), na.rm=TRUE,
value.name = c("Ids", "string"))[!duplicated(Ids, fromLast=TRUE)
][, variable := NULL][order(Ids)]
This is the function:
remove_column <- function(column_vector) {
for (column in column_vector) {
if (grepl('.y$', column)) {
mydata$column <- NULL
}
}
}
What I think it'd doing: I'm passing a vector of my column names to the function, it's looping through list of names and asking whether the last characters of each column name are ".y". If that is the case, the function eliminates the column.
I've tried putting prints here and there to see my vector and to see whether the conditional evaluates to TRUE or FALSE, and everything seems to be working fine, but for some reason, it doesn't get rid of the column.
The following function returns my column vector:
duplicate_names <- function(col_names) {
duplicates <- c()
for (name in col_names) {
# split by period i.e. colname.x would be [colname, x]
if (lengths(strsplit(name, '\\.')) > 1) {
duplicates <- c(duplicates, name)
}
}
return(duplicates)
}
I usually call it like this:
duplicate_names(names(mydata))
This is what the vector of columns looks like:
c('v1.x', 'v2.y')
When I print the function it returns the following:
[1] "v1.x" "v2.y"
As requested by a user, the dput(droplevels(horsedata[1:5, 1:5])) (data that I am using for this):
dput(droplevels(horsedata[1:5, 1:5]))
structure(list(ÿþhorse_name = structure(c(3L, 1L, 2L, 4L, 5L), .Label = c("IM PRETTY FAMES",
"JESS ROYAL BUCKS", "KISS ME IM SUGAR", "LOLAMO", "RUN MADISON RUN"
), class = "factor"), owner_name = structure(c(3L, 2L, 1L, 5L,
4L), .Label = c("Christine Tavares", "Heste Sport, Inc.", "Picov Cattle Co.",
"Procter, Wayne and Carol", "Ruth F. Barbour"), class = "factor"),
program = structure(1:5, .Label = c("1", "2", "3", "4", "5"
), class = "factor"), pp = 1:5, todays_cls = c(61L, 61L,
61L, 61L, 61L)), .Names = c("ÿþhorse_name", "owner_name",
"program", "pp", "todays_cls"), row.names = c(NA, 5L), class = "data.frame")
We don't need a loop to subset the columns.
mydata[!grepl('\\.y$', column_list)]
If there are other columns not in the column_list and we want to keep them (assuming that the 'column_list' is ordered)
mydata[setdiff(1:ncol(mydata), grep('\\.y$', column_list))]
We can modify your function by
changing .y$ to \\.y$ as . means any character and not just the dot
Instead of $, we use [ to subset the dataset
Return the dataset after the assignment
remove_column <- function(dat, column_vec) {
for (column in column_vec) {
if (grepl('\\.y$', column, perl=TRUE)) {
dat[column] <- NULL
}
}
dat
}
remove_column(mydata, column_list)
# v1.x v2.x v3
#1 6 1 9
#2 4 11 7
#3 14 15 5
#4 10 2 4
#5 13 4 0
#6 19 14 1
#7 5 1 8
#8 16 12 7
#9 16 13 5
#10 5 0 7
data
mydata <- structure(list(v1.x = c(6L, 4L, 14L, 10L, 13L, 19L, 5L, 16L,
16L, 5L), v1.y = c(12L, 7L, 14L, 14L, 6L, 18L, 4L, 0L, 10L, 2L
), v2.x = c(1L, 11L, 15L, 2L, 4L, 14L, 1L, 12L, 13L, 0L), v2.y = c(6L,
5L, 7L, 3L, 19L, 4L, 15L, 13L, 14L, 20L), v3 = c(9L, 7L, 5L,
4L, 0L, 1L, 8L, 7L, 5L, 7L)), .Names = c("v1.x", "v1.y", "v2.x",
"v2.y", "v3"), row.names = c(NA, -10L), class = "data.frame")
column_list <- c('v1.x', 'v1.y', 'v2.x', 'v2.y')