In dplyr group_by() + summarise(sum)is not working - r

This is my code:
df <- structure(list(NOME = c("JOGADOR 1", "JOGADOR 1", "JOGADOR 6",
"JOGADOR 6", "JOGADOR 5", "JOGADOR 5", "JOGADOR 3", "JUGADOR 3",
"JOGADOR 9", "JOGADOR 9", "JOGADOR 7", "JOGADOR 7", "JOGADOR 8",
"JOGADOR 8", "JOGADOR 10", "JOGADOR 10", "JOGADOR 4", "JOGADOR 4",
"JOGADOR 2", "JOGADOR 2", "JOGADOR 12", "JOGADOR 11", "JOGADOR 13"
), TOTAL_MINUTES = c(48.15, 43, 48.15, 51.9333333333333, 48.15,
51.9333333333333, 48.15, 51.9333333333333, 48.15, 25, 48.15,
51.9333333333333, 48.15, 29, 48.15, 42, 48.15, 51.9333333333333,
48.15, 51.9333333333333, 17, 26, 9), TOTAL.DISTANCE = c(5264L,
3999L, 5242L, 5589L, 5684L, 5966L, 4833L, 5012L, 5013L, 2653L,
5452L, 5691L, 5041L, 3775L, 5266L, 4321L, 4795L, 4924L, 5209L,
5242L, 2085L, 2703L, 1282L)), row.names = c(NA, -23L), class = c("tbl_df",
"tbl", "data.frame"))
Its a simple task but its not working:
df %>%group_by(NOME) %>%
summarise(across(TOTAL_MINUTES:TOTAL.DISTANCE),sum())
It just reapting the NOME column values. Its not summing and giving one line per "JOGADOR X".
Why? Any help?

The across was closed without the sum. Also, if we are not providing any lambda expression, we don't use sum()
library(dplyr)
df %>%
group_by(NOME) %>%
summarise(across(TOTAL_MINUTES:TOTAL.DISTANCE, sum,
na.rm = TRUE), .groups = 'drop')
-output
# A tibble: 14 × 3
NOME TOTAL_MINUTES TOTAL.DISTANCE
<chr> <dbl> <int>
1 JOGADOR 1 91.2 9263
2 JOGADOR 10 90.2 9587
3 JOGADOR 11 26 2703
4 JOGADOR 12 17 2085
5 JOGADOR 13 9 1282
6 JOGADOR 2 100. 10451
7 JOGADOR 3 48.2 4833
8 JOGADOR 4 100. 9719
9 JOGADOR 5 100. 11650
10 JOGADOR 6 100. 10831
11 JOGADOR 7 100. 11143
12 JOGADOR 8 77.2 8816
13 JOGADOR 9 73.2 7666
14 JUGADOR 3 51.9 5012
Or using lambda expression
df %>%
group_by(NOME) %>%
summarise(across(TOTAL_MINUTES:TOTAL.DISTANCE, ~sum(.x,
na.rm = TRUE)), .groups = 'drop')

Related

Pivot Wider causing issues when as.yearmon is used

I have the following code:
library(zoo)
library(xts)
df1<-structure(list(Date = structure(c(13523, 13532, 13539, 13551,
13565, 13567, 13579, 13588, 13600, 13607, 13616, 13628, 13637,
13656, 13658, 13670, 13686, 13691, 13698, 13705, 13721, 13735,
13768, 13770, 13783, 13789, 13797, 13811, 13819, 13824, 13838,
13846, 13852, 13860), class = "Date"), Category = c("Type 1",
"Type 2", "Type 1", "Type 1", "Type 1", "Type 2", "Type 1", "Type 3",
"Type 1", "Type 1", "Type 2", "Type 1", "Type 1", "Type 1", "Type 2",
"Type 1", "Type 3", "Type 1", "Type 1", "Type 1", "Type 1", "Type 2",
"Type 1", "Type 3", "Type 1", "Type 1", "Type 1", "Type 1", "Type 2",
"Type 1", "Type 1", "Type 1", "Type 3", "Type 2"), Value = c(2250,
1200, 625, 2250, 1000, 2750, 2250, 2750, 950, 2000, 1100, 950,
2250, 1000, 2500, 2250, 2500, 1000, 2250, 1200, 700, 2500, 2000,
2500, 900, 2250, 1200, 925, 2500, 2250, 750, 2000, 2500, 950)), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -34L), groups = structure(list(
Date = structure(c(13523, 13532, 13539, 13551, 13565, 13567,
13579, 13588, 13600, 13607, 13616, 13628, 13637, 13656, 13658,
13670, 13686, 13691, 13698, 13705, 13721, 13735, 13768, 13770,
13783, 13789, 13797, 13811, 13819, 13824, 13838, 13846, 13852,
13860), class = "Date"), .rows = structure(list(1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L,
16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L,
27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -34L), .drop = TRUE))
I've created a rolling_sum by month for this particular dataset using:
df_month <- df1 %>%
group_by(Category, Month = format(Date, "%Y-%m-%d")) %>%
summarize(Rolling_Sum = sum(Value))
df_month$Month <- as.yearmon(df_month$Month)
In preparation for a conversion to an xts format I'd like to pivot-wider and replace all null/NAs values with 0. However the pivot-wider seems to break the dataset, making the null replacement and xts conversion impossible:
df_turned <- df_month %>% group_by(Category) %>% pivot_wider(names_from = Category, values_from = Rolling_Sum, id_cols = Month)
If that had worked, I would have done:
df_turned <- df_turned %>% replace(.=="NULL", 0)
Then:
df_turned <- xts(df_turned, order.by = df_turned$Month)
Any advice most appreciated.
If we don't want duplicates, then use values_fn
library(tidyr)
library(dplyr)
df_turned <- df_month %>%
ungroup %>%
pivot_wider(names_from = Category, values_from = Rolling_Sum,
values_fn = sum, values_fill = 0)
-output
df_turned
# A tibble: 12 × 4
Month `Type 1` `Type 2` `Type 3`
<yearmon> <dbl> <dbl> <dbl>
1 Jan 2007 2875 1200 0
2 Feb 2007 3250 2750 0
3 Mar 2007 3200 0 2750
4 Apr 2007 2950 1100 0
5 May 2007 3250 2500 0
6 Jun 2007 3250 0 2500
7 Jul 2007 4150 0 0
8 Sep 2007 2900 0 2500
9 Oct 2007 4375 0 0
10 Nov 2007 5000 2500 0
11 Aug 2007 0 2500 0
12 Dec 2007 0 950 2500
Now, we can convert to xts
xts(df_turned[-1], order.by = df_turned$Month)
Type 1 Type 2 Type 3
Jan 2007 2875 1200 0
Feb 2007 3250 2750 0
Mar 2007 3200 0 2750
Apr 2007 2950 1100 0
May 2007 3250 2500 0
Jun 2007 3250 0 2500
Jul 2007 4150 0 0
Aug 2007 0 2500 0
Sep 2007 2900 0 2500
Oct 2007 4375 0 0
Nov 2007 5000 2500 0
Dec 2007 0 950 2500
As indicated in my comment, your problem is that you create duplicates because as.yearmon is called after the grouping by "Month". You are de facto grouping by "Date". We could do:
library(dplyr)
library(tidyr)
df1 |>
group_by(Category,
Month = as.yearmon(Date)) |>
pivot_wider(names_from = Category,
values_from = Value,
values_fn = sum,
values_fill = 0
) |>
select(-Date) # Or mutate "Date" above instead of creating "Month".
Then call xts.
Month = as.yearmon(Date) shouldn't cause a problem if Date is a date-type. However, if it is causing trouble as you indicate in your comment, as.yearmon(format(Date, "%Y-%m-%d")).
Output:
# A tibble: 12 × 4
Month `Type 1` `Type 2` `Type 3`
<yearmon> <dbl> <dbl> <dbl>
1 Jan 2007 2875 1200 0
2 Feb 2007 3250 2750 0
3 Mar 2007 3200 0 2750
4 Apr 2007 2950 1100 0
5 May 2007 3250 2500 0
6 Jun 2007 3250 0 2500
7 Jul 2007 4150 0 0
8 Sep 2007 2900 0 2500
9 Oct 2007 4375 0 0
10 Nov 2007 5000 2500 0
11 Aug 2007 0 2500 0
12 Dec 2007 0 950 2500
Update After #akrun updated answer with a similar solution, my solution seems more verbose. The reason is that my approach works directly on the df1 object and solves the problem there.
Use read.zoo like this:
library(zoo)
df_month |>
read.zoo(index = "Month", split = "Category", aggregate = sum) |>
na.fill(0)
giving this zoo object -- as.xts can be used to convert that to xts if needed.
Type 1 Type 2 Type 3
Jan 2007 2875 1200 0
Feb 2007 3250 2750 0
Mar 2007 3200 0 2750
Apr 2007 2950 1100 0
May 2007 3250 2500 0
Jun 2007 3250 0 2500
Jul 2007 4150 0 0
Aug 2007 0 2500 0
Sep 2007 2900 0 2500
Oct 2007 4375 0 0
Nov 2007 5000 2500 0
Dec 2007 0 950 2500
or directly from df1 modified from the comment below
df1 |>
read.zoo(df1, FUN = as.yearmon, split = "Category", aggregate = sum) |>
na.fill(0)
Note
df_month from question in immediately reproducible form
df_month <-
structure(list(Category = c("Type 1", "Type 1", "Type 1", "Type 1",
"Type 1", "Type 1", "Type 1", "Type 1", "Type 1", "Type 1", "Type 1",
"Type 1", "Type 1", "Type 1", "Type 1", "Type 1", "Type 1", "Type 1",
"Type 1", "Type 1", "Type 1", "Type 1", "Type 1", "Type 2", "Type 2",
"Type 2", "Type 2", "Type 2", "Type 2", "Type 2", "Type 3", "Type 3",
"Type 3", "Type 3"), Month = structure(c(2007, 2007, 2007.08333333333,
2007.08333333333, 2007.16666666667, 2007.16666666667, 2007.25,
2007.25, 2007.33333333333, 2007.33333333333, 2007.41666666667,
2007.41666666667, 2007.5, 2007.5, 2007.5, 2007.66666666667, 2007.66666666667,
2007.75, 2007.75, 2007.75, 2007.83333333333, 2007.83333333333,
2007.83333333333, 2007, 2007.08333333333, 2007.25, 2007.33333333333,
2007.58333333333, 2007.83333333333, 2007.91666666667, 2007.16666666667,
2007.41666666667, 2007.66666666667, 2007.91666666667), class = "yearmon"),
Rolling_Sum = c(2250, 625, 2250, 1000, 2250, 950, 2000, 950,
2250, 1000, 2250, 1000, 2250, 1200, 700, 2000, 900, 2250,
1200, 925, 2250, 750, 2000, 1200, 2750, 1100, 2500, 2500,
2500, 950, 2750, 2500, 2500, 2500)), row.names = c(NA, -34L
), groups = structure(list(Category = c("Type 1", "Type 2", "Type 3"
), .rows = structure(list(1:23, 24:30, 31:34), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -3L), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))

Is there a way to plot correlation heatmap between two dataframes in R? The two dataframes have different row names and are of unequal dimesions

I have two different dataframes as shown in figures attached. Dataframe1 and Dataframe2.
This is what I tried.
#First dataframe
structure(list(Label = c("Gene 1", "Gene 2", "Gene 3", "Gene 4",
"Gene 5", "Gene 6", "Gene 7", "Gene 8", "Gene 9", "Gene 10",
"Gene 11", "Gene 12", "Gene 13", "Gene 14", "Gene 15", "Gene 16",
"Gene 17", "Gene 18", "Gene 19", "Gene 20", "Gene 21", "Gene 22",
"Gene 23", "Gene 24", "Gene 25", "Gene 26", "Gene 27", "Gene 28",
"Gene 29", "Gene 30"), Count = c(1500, 1600, 1700, 1800, 1900,
2000, 2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000,
3100, 3200, 3300, 3400, 3500, 3600, 3700, 3800, 3900, 4000, 4100,
4200, 4300, 4400)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-30L))
df_1 <- read_excel("Demo_data.xlsx", sheet = "Dataframe1")
str(df_1)
View(df_1)
df_1.1 <- column_to_rownames(df_1, 'Label')
View(df_1.1)
df_1.2 <- t(df_1.1)
View(df_1.2)
df_1.2 <- as.data.frame(df_1.2)
str(df_1.2)
typeof(dff1)
str(dff1)
#Second dataframe
structure(list(Label = c("Control1", "Control2", "Control3",
"Control4", "Control5", "Control6", "Control7", "Control8", "Control9",
"Control10", "Control11", "Control12", "Control13", "Control14",
"Control15", "Control16", "Control17", "Control18", "Control19",
"Control20", "Control21", "Control22", "Control23", "Control24"
), Count = c(1800, 1400, 1110, 1900, 2500, 2900, 2100, 900, 5000,
2300, 700, 1400, 3400, 2310, 3322, 2200, 4400, 2100, 1000, 6700,
4300, 2120, 4800, 4300)), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -24L))
df_2 <- read_excel("Demo_data.xlsx", sheet = "Dataframe2")
df_2.1 <- column_to_rownames(df_2, 'Label')
View(df_2.1)
df_2.1 <- t(df_2.1)
View(df_2.1)
df_2.1 <- as.data.frame(df_2.1)
str(df_2.1)
correlation <- cor(df_1.2, df_2.1)
View(correlation)
This is my desired output but I am getting NA for every correlation. Any help is highly appreciated.
Desired output (without NA)
As it is written in the comments, what you are trying to achieve is rather unclear.
If you want to compute the correlation between the Count column in each dataframe and visualize it using a scatterplot, you can use the following code:
library(tidyverse)
df_1 <- structure(list(Label = c("Gene 1", "Gene 2", "Gene 3", "Gene 4",
"Gene 5", "Gene 6", "Gene 7", "Gene 8", "Gene 9", "Gene 10",
"Gene 11", "Gene 12", "Gene 13", "Gene 14", "Gene 15", "Gene 16",
"Gene 17", "Gene 18", "Gene 19", "Gene 20", "Gene 21", "Gene 22",
"Gene 23", "Gene 24", "Gene 25", "Gene 26", "Gene 27", "Gene 28",
"Gene 29", "Gene 30"),
Count = c(1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, 2300, 2400, 2500,
2600, 2700, 2800, 2900, 3000, 3100, 3200, 3300, 3400, 3500, 3600,
3700, 3800, 3900, 4000, 4100, 4200, 4300, 4400)),
class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -30L))
df_2 <- structure(list(Label = c("Control1", "Control2", "Control3",
"Control4", "Control5", "Control6", "Control7", "Control8", "Control9",
"Control10", "Control11", "Control12", "Control13", "Control14",
"Control15", "Control16", "Control17", "Control18", "Control19",
"Control20", "Control21", "Control22", "Control23", "Control24"),
Count = c(1800, 1400, 1110, 1900, 2500, 2900, 2100, 900, 5000, 2300, 700, 1400,
3400, 2310, 3322, 2200, 4400, 2100, 1000, 6700, 4300, 2120, 4800, 4300)),
class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -24L))
dat = left_join(
df_1 %>% mutate(id=str_extract(Label, "\\d+")),
df_2 %>% mutate(id=str_extract(Label, "\\d+")),
by="id", suffix=c("_gene", "_ctl")
)
dat
#> # A tibble: 30 x 5
#> Label_gene Count_gene id Label_ctl Count_ctl
#> <chr> <dbl> <chr> <chr> <dbl>
#> 1 Gene 1 1500 1 Control1 1800
#> 2 Gene 2 1600 2 Control2 1400
#> 3 Gene 3 1700 3 Control3 1110
#> 4 Gene 4 1800 4 Control4 1900
#> 5 Gene 5 1900 5 Control5 2500
#> 6 Gene 6 2000 6 Control6 2900
#> 7 Gene 7 2100 7 Control7 2100
#> 8 Gene 8 2200 8 Control8 900
#> 9 Gene 9 2300 9 Control9 5000
#> 10 Gene 10 2400 10 Control10 2300
#> # ... with 20 more rows
cor(dat$Count_gene, dat$Count_ctl, use="pairwise.complete.obs")
#> [1] 0.5047392
ggplot(dat, aes(x=Count_gene, y=Count_ctl)) +
geom_point()
#> Warning: Removed 6 rows containing missing values (`geom_point()`).
Created on 2022-12-12 with reprex v2.0.2
Basically, I extracted the id as the last digits of the label, then used left_join() to merge the dataframes.
This might look overly complicated but it is always a good idea to keep your data tidy in one dataframe.
Note that in your example, df_2 stops at id==24 so the correlation is computed on the 24 complete observations only.
However, a correlation is computed across 2 vectors, so in order to have a heatmap you would need a set of many vectors, which you don't seem to have.
For your next questions, it would be great if you use the reprex package as I did in this answer.

Renaming labels of a factor in R

I have census data of Male and Female populations organizaed by age group:
library(tidyverse)
url <- "https://www2.census.gov/programs-surveys/popest/datasets/2010-2018/counties/asrh/cc-est2018-alldata-54.csv"
if (!file.exists("./datafiles/cc-est2018-alldata-54.csv"))
download.file(url, destfile = "./datafiles/cc-est2018-alldata-54.csv", mode = "wb")
popSample <- read.csv("./datafiles/cc-est2018-alldata-54.csv") %>%
filter(AGEGRP != 0 & YEAR == 1) %>%
select("STNAME", "CTYNAME", "AGEGRP", "TOT_POP", "TOT_MALE", "TOT_FEMALE")
popSample$AGEGRP <- as.factor(popSample$AGEGRP)
I then plot the Male and Female population relationships, faceted by age group (1-18, which is currently treated as a int
g <- ggplot(popSample, aes(x=TOT_MALE, y=TOT_FEMALE)) +
geom_point(alpha = 0.5, colour="darkblue") +
scale_x_log10() +
scale_y_log10() +
facet_wrap(~AGEGRP) +
stat_smooth(method = "lm", col = "darkred", size=.75) +
labs(title = "F vs. M Population across all Age Groups", x = "Total Male (log10)", y = "Total Female (log10)") +
theme_light()
g
Which results in this plot: https://share.getcloudapp.com/v1ur6O4e
The problem: I am trying to convert the column AGEGRP from ‘int’ to ‘factor’, and change the factors labels from “1”, “2”, “3”, … “18” to "AgeGroup1", "AgeGroup2", "AgeGroup3", … "AgeGroup18"
When I try this code, my AGEGRP column's observation values are all replaced with NAs:popSample$AGEGRP <- factor(popSample$AGEGRP, levels = c("0 to 4", "5 to 9", "10 to 14", "15 to 19", "20 to 24", "25 to 29", "30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59", "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85+"))
https://share.getcloudapp.com/qGuo1O4y
Thank you for your help,
popSample$AGEGRP <- factor( popSample$AGEGRP, levels = c("0 to 4", "5 to 9", "10 to 14", "15 to 19", "20 to 24", "25 to 29", "30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59", "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85+"))
Need to add all levels though.
Alternatively
levels(popSample$AGEGRP) <- c("0 to 4", "5 to 9", "10 to 14", "15 to 19", "20 to 24", "25 to 29", "30 to 34", "35 to 39", "40 to 44", "45 to 49", "50 to 54", "55 to 59", "60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85+")
should work as well.
Read in the csv again:
library(tidyverse)
url <- "https://www2.census.gov/programs-surveys/popest/datasets/2010-2018/counties/asrh/cc-est2018-alldata-54.csv"
popSample <- read.csv(url) %>%
filter(AGEGRP != 0 & YEAR == 1) %>%
select("STNAME", "CTYNAME", "AGEGRP", "TOT_POP", "TOT_MALE", "TOT_FEMALE")
If you just want to add a prefix "AgeGroup" to your facet labels, you do:
ggplot(popSample, aes(x=TOT_MALE, y=TOT_FEMALE)) +
geom_point(alpha = 0.5, colour="darkblue") +
scale_x_log10() +
scale_y_log10() +
facet_wrap(~AGEGRP,labeller=labeller(AGEGRP = function(i)paste0("AgeGroup",i))) +
stat_smooth(method = "lm", col = "darkred", size=.75) +
labs(title = "F vs. M Population across all Age Groups",
x = "Total Male (log10)", y = "Total Female (log10)") +
theme_light()
If there is a need for new factors, then you need to refactor (like #Annet's answer below):
lvls = c("0 to 4", "5 to 9", "10 to 14", "15 to 19",
"20 to 24", "25 to 29", "30 to 34", "35 to 39",
"40 to 44", "45 to 49", "50 to 54", "55 to 59",
"60 to 64", "65 to 69", "70 to 74", "75 to 79", "80 to 84", "85+")
#because you have factorize it
# if you can read the csv again, skip the factorization
popSample$AGEGRP = factor(lvls[popSample$AGEGRP],levels=lvls)
Then plot:
ggplot(popSample, aes(x=TOT_MALE, y=TOT_FEMALE)) +
geom_point(alpha = 0.5, colour="darkblue") +
scale_x_log10() +
scale_y_log10() +
facet_wrap(~AGEGRP) +
stat_smooth(method = "lm", col = "darkred", size=.75) +
labs(title = "F vs. M Population across all Age Groups",
x = "Total Male (log10)", y = "Total Female (log10)") +
theme_light()
To change all the factor labels with one function, you can use forcats::fct_relabel (forcats ships as part of the tidyverse, which you've already got loaded). The changed factor labels will carry over to the plot facets and the order stays the same.
First few entries:
# before relabelling
popSample$AGEGRP[1:4]
#> [1] 1 2 3 4
#> Levels: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
# after relabelling
forcats::fct_relabel(popSample$AGEGRP, ~paste0("AgeGroup", .))[1:4]
#> [1] AgeGroup1 AgeGroup2 AgeGroup3 AgeGroup4
#> 18 Levels: AgeGroup1 AgeGroup2 AgeGroup3 AgeGroup4 AgeGroup5 ... AgeGroup18
Or with base R, reassign the levels:
levels(popSample$AGEGRP) <- paste0("AgeGroup", levels(popSample$AGEGRP))
popSample$AGEGRP[1:4]
#> [1] AgeGroup1 AgeGroup2 AgeGroup3 AgeGroup4
#> 18 Levels: AgeGroup1 AgeGroup2 AgeGroup3 AgeGroup4 AgeGroup5 ... AgeGroup18

R select certain values from multiple columns using conditions

I want to select certain values from multiple columns using conditions.(also let assign row 1 as ID#1, ... row5 as ID#5)
column1 <- c("rice 2", "apple 4", "melon 6", "blueberry 4", "orange 6")
column2 <- c("rice 8", "blueberry 8", "grape 10", "water 10", "mango 3")
column3 <- c("rice 6", "apple 8", "blueberry 12", "pineapple 8", "mango 3")
I want to get new column using IDs with condition only rice > 5, blueberry > 7 or orange > 5
First, I would like to get ID#1, ID#2, ID#3, ID#5
Second, I would to count how many conditions met per ID
I would like to get results
ID#1 -> 2 conditions met
ID#2 -> 1 conditions met
ID#3 -> 1 conditions met
ID#4 -> 0 conditions met
ID#5 -> 1 conditions met
If I understood the question correctly then one of the approach could be
library(dplyr)
cols <- names(df)[-1]
df1 <- df %>%
mutate_if(is.factor, as.character) %>%
mutate(rice_gt_5 = (select(., one_of(cols)) %>%
rowwise() %>%
mutate_all(funs(strsplit(., split=" ")[[1]][1] =='rice' & as.numeric(strsplit(., split=" ")[[1]][2]) > 5)) %>%
rowSums)) %>%
mutate(blueberry_gt_7 = (select(., one_of(cols)) %>%
rowwise() %>%
mutate_all(funs(strsplit(., split=" ")[[1]][1] =='blueberry' & as.numeric(strsplit(., split=" ")[[1]][2]) > 7)) %>%
rowSums)) %>%
mutate(orange_gt_5 = (select(., one_of(cols)) %>%
rowwise() %>%
mutate_all(funs(strsplit(., split=" ")[[1]][1] =='orange' & as.numeric(strsplit(., split=" ")[[1]][2]) > 5)) %>%
rowSums))
#IDs which satisfy at least one of your conditions i.e. rice > 5 OR blueberry > 7 OR orange > 5
df1$ID[which(df1 %>% select(rice_gt_5, blueberry_gt_7, orange_gt_5) %>% rowSums() >0)]
#[1] 1 2 3 5
#How many conditions are met per ID
df1 %>%
mutate(no_of_cond_met = rowSums(select(., one_of(c("rice_gt_5", "blueberry_gt_7", "orange_gt_5"))))) %>%
select(ID, no_of_cond_met)
# ID no_of_cond_met
#1 1 2
#2 2 1
#3 3 1
#4 4 0
#5 5 1
Sample data:
df <- structure(list(ID = 1:5, column1 = structure(c(5L, 1L, 3L, 2L,
4L), .Label = c("apple 4", "blueberry 4", "melon 6", "orange 6",
"rice 2"), class = "factor"), column2 = structure(c(4L, 1L, 2L,
5L, 3L), .Label = c("blueberry 8", "grape 10", "mango 3", "rice 8",
"water 10"), class = "factor"), column3 = structure(c(5L, 1L,
2L, 4L, 3L), .Label = c("apple 8", "blueberry 12", "mango 3",
"pineapple 8", "rice 6"), class = "factor")), .Names = c("ID",
"column1", "column2", "column3"), row.names = c(NA, -5L), class = "data.frame")

Convert a date range to Date type in R

This vector of date ranges is included in a dataframe of mine with class 'character'. The formats vary depending on whether the date range crosses into a different month:
dput(pollingdata$dates)
c("Nov. 1-7", "Nov. 1-7", "Oct. 24-Nov. 6", "Oct. 4-Nov. 6",
"Oct. 30-Nov. 6", "Oct. 25-31", "Oct. 7-27", "Oct. 21-Nov. 3",
"Oct. 20-24", "Jul. 19", "Oct. 29-Nov. 4", "Oct. 28-Nov. 3",
"Oct. 27-Nov. 2", "Oct. 20-28", "Sep. 30-Oct. 20", "Oct. 15-19",
"Oct. 26-Nov. 1", "Oct. 25-31", "Oct. 24-30", "Oct. 18-26",
"Oct. 10-14", "Oct. 4-9", "Sep. 23-Oct. 6", "Sep. 16-29", "Sep. 2-22",
"Oct. 21-Nov. 2", "Oct. 17-25", "Sep. 30-Oct. 13", "Sep. 27-Oct. 3",
"Sep. 21-26", "Sep. 14-20", "Aug. 26-Sep. 15", "Sep. 7-13",
"Aug. 19-Sep. 8", "Aug. 31-Sep. 6", "Aug. 12-Sep. 1", "Aug. 9-Sep. 1",
"Aug. 24-30", "Aug. 5-25", "Aug. 17-23", "Jul. 29-Aug. 18",
"Aug. 10-16", "Jan. 12")
I would like to convert this vector into two separate columns in my dataframe, 1. startdate and 2. enddate, for the beginning and end of the range. Both columns should be saved as class 'Date', this will make it easier for me to use the data in my project. Does anyone know an easy way to do this manipulation? I have been struggling with it.
Thanks in advance,
We can split the vector by - into a list, replace the elements that have only numbers at the end by pasteing the month substring, append NA for those having less than 2 elements using (length<-) and convert to data.frame (with do.call(rbind.data.frame)
lst <- lapply(strsplit(v1, "-"), function(x) {
i1 <- grepl("^[0-9]+", x[length(x)])
if(i1) {
x[length(x)] <- paste(substr(x[1], 1, 4), x[length(x)])
x} else x})
d1 <- do.call(rbind.data.frame, lapply(lst, `length<-`, max(lengths(lst))))
colnames(d1) <- c("Start_Date", "End_Date")
As per the OP's post, we need to convert to Date class, but Date class follows the format of %Y-%m-%d. In the vector, there is no year, not sure we can paste the current year and convert to Date class. If that is permissible, then
d1[] <- lapply(d1, function(x) as.Date(paste(x, 2017), "%b. %d %Y"))
head(d1)
# Start_Date End_Date
#1 2017-11-01 2017-11-07
#2 2017-11-01 2017-11-07
#3 2017-10-24 2017-11-06
#4 2017-10-04 2017-11-06
#5 2017-10-30 2017-11-06
#6 2017-10-25 2017-10-31
You may use library stringr function "str_split_fixed" to split the fields and then process the data. Map the library stringr and process as below:
library(stringr)
dat <- data.frame(date=c("Nov. 1-7", "Nov. 1-7", "Oct. 24-Nov. 6", "Oct. 4-Nov. 6",
"Oct. 30-Nov. 6", "Oct. 25-31", "Oct. 7-27", "Oct. 21-Nov. 3",
"Oct. 20-24", "Jul. 19", "Oct. 29-Nov. 4", "Oct. 28-Nov. 3",
"Oct. 27-Nov. 2", "Oct. 20-28", "Sep. 30-Oct. 20", "Oct. 15-19",
"Oct. 26-Nov. 1", "Oct. 25-31", "Oct. 24-30", "Oct. 18-26",
"Oct. 10-14", "Oct. 4-9", "Sep. 23-Oct. 6", "Sep. 16-29", "Sep. 2-22",
"Oct. 21-Nov. 2", "Oct. 17-25", "Sep. 30-Oct. 13", "Sep. 27-Oct. 3",
"Sep. 21-26", "Sep. 14-20", "Aug. 26-Sep. 15", "Sep. 7-13",
"Aug. 19-Sep. 8", "Aug. 31-Sep. 6", "Aug. 12-Sep. 1", "Aug. 9-Sep. 1",
"Aug. 24-30", "Aug. 5-25", "Aug. 17-23", "Jul. 29-Aug. 18",
"Aug. 10-16", "Jan. 12"))
Output processing:
#spliting with space and dash
dt <- data.frame(str_split_fixed(dat$date, "[-]|\\s",4))
names(dt) <- c("stdt1","stdt2","endt1","endt2")
##Removing dot(.) and replacing with ""
dt1 <- data.frame(sapply(dt,function(x)gsub("[.]","",x)))
dt1$stdt <- as.Date(paste0(dt1$stdt2,dt1$stdt1,"2016"),format="%d%b%Y")
dt1$endt <- ifelse(dt1$endt2=="",paste0(dt1$endt1,dt1$stdt1,"2016"),
paste0(dt1$endt2,dt1$endt1,"2016"))
dt1$endt <-as.Date(ifelse(nchar(dt1$endt)==7,paste0(dt1$stdt2,dt1$endt),dt1$endt),"%d%b%Y")
Assumptions:
1) No year provided , hence I have taken year as 2016.
2) On 10th row and 43rd row, there is no info on end date "day",hence I have assumed the same day as start date.
Answer:
> dt1
stdt1 stdt2 endt1 endt2 stdt endt
1 Nov 1 7 2016-11-01 2016-11-07
2 Nov 1 7 2016-11-01 2016-11-07
3 Oct 24 Nov 6 2016-10-24 2016-11-06
4 Oct 4 Nov 6 2016-10-04 2016-11-06
5 Oct 30 Nov 6 2016-10-30 2016-11-06
6 Oct 25 31 2016-10-25 2016-10-31
7 Oct 7 27 2016-10-07 2016-10-27
8 Oct 21 Nov 3 2016-10-21 2016-11-03
9 Oct 20 24 2016-10-20 2016-10-24
10 Jul 19 2016-07-19 2016-07-19

Resources