Pivot Wider causing issues when as.yearmon is used - r

I have the following code:
library(zoo)
library(xts)
df1<-structure(list(Date = structure(c(13523, 13532, 13539, 13551,
13565, 13567, 13579, 13588, 13600, 13607, 13616, 13628, 13637,
13656, 13658, 13670, 13686, 13691, 13698, 13705, 13721, 13735,
13768, 13770, 13783, 13789, 13797, 13811, 13819, 13824, 13838,
13846, 13852, 13860), class = "Date"), Category = c("Type 1",
"Type 2", "Type 1", "Type 1", "Type 1", "Type 2", "Type 1", "Type 3",
"Type 1", "Type 1", "Type 2", "Type 1", "Type 1", "Type 1", "Type 2",
"Type 1", "Type 3", "Type 1", "Type 1", "Type 1", "Type 1", "Type 2",
"Type 1", "Type 3", "Type 1", "Type 1", "Type 1", "Type 1", "Type 2",
"Type 1", "Type 1", "Type 1", "Type 3", "Type 2"), Value = c(2250,
1200, 625, 2250, 1000, 2750, 2250, 2750, 950, 2000, 1100, 950,
2250, 1000, 2500, 2250, 2500, 1000, 2250, 1200, 700, 2500, 2000,
2500, 900, 2250, 1200, 925, 2500, 2250, 750, 2000, 2500, 950)), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -34L), groups = structure(list(
Date = structure(c(13523, 13532, 13539, 13551, 13565, 13567,
13579, 13588, 13600, 13607, 13616, 13628, 13637, 13656, 13658,
13670, 13686, 13691, 13698, 13705, 13721, 13735, 13768, 13770,
13783, 13789, 13797, 13811, 13819, 13824, 13838, 13846, 13852,
13860), class = "Date"), .rows = structure(list(1L, 2L, 3L,
4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L, 14L, 15L,
16L, 17L, 18L, 19L, 20L, 21L, 22L, 23L, 24L, 25L, 26L,
27L, 28L, 29L, 30L, 31L, 32L, 33L, 34L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -34L), .drop = TRUE))
I've created a rolling_sum by month for this particular dataset using:
df_month <- df1 %>%
group_by(Category, Month = format(Date, "%Y-%m-%d")) %>%
summarize(Rolling_Sum = sum(Value))
df_month$Month <- as.yearmon(df_month$Month)
In preparation for a conversion to an xts format I'd like to pivot-wider and replace all null/NAs values with 0. However the pivot-wider seems to break the dataset, making the null replacement and xts conversion impossible:
df_turned <- df_month %>% group_by(Category) %>% pivot_wider(names_from = Category, values_from = Rolling_Sum, id_cols = Month)
If that had worked, I would have done:
df_turned <- df_turned %>% replace(.=="NULL", 0)
Then:
df_turned <- xts(df_turned, order.by = df_turned$Month)
Any advice most appreciated.

If we don't want duplicates, then use values_fn
library(tidyr)
library(dplyr)
# Drop the leftover grouping, then spread Category into columns.
# Cells that share a Month/Category pair are summed; absent pairs become 0.
df_turned <- pivot_wider(
  ungroup(df_month),
  names_from = Category,
  values_from = Rolling_Sum,
  values_fn = sum,
  values_fill = 0
)
-output
df_turned
# A tibble: 12 × 4
Month `Type 1` `Type 2` `Type 3`
<yearmon> <dbl> <dbl> <dbl>
1 Jan 2007 2875 1200 0
2 Feb 2007 3250 2750 0
3 Mar 2007 3200 0 2750
4 Apr 2007 2950 1100 0
5 May 2007 3250 2500 0
6 Jun 2007 3250 0 2500
7 Jul 2007 4150 0 0
8 Sep 2007 2900 0 2500
9 Oct 2007 4375 0 0
10 Nov 2007 5000 2500 0
11 Aug 2007 0 2500 0
12 Dec 2007 0 950 2500
Now, we can convert to xts
xts(df_turned[-1], order.by = df_turned$Month)
Type 1 Type 2 Type 3
Jan 2007 2875 1200 0
Feb 2007 3250 2750 0
Mar 2007 3200 0 2750
Apr 2007 2950 1100 0
May 2007 3250 2500 0
Jun 2007 3250 0 2500
Jul 2007 4150 0 0
Aug 2007 0 2500 0
Sep 2007 2900 0 2500
Oct 2007 4375 0 0
Nov 2007 5000 2500 0
Dec 2007 0 950 2500

As indicated in my comment, your problem is that you create duplicates because as.yearmon is called after the grouping by "Month". You are de facto grouping by "Date". We could do:
library(dplyr)
library(tidyr)
# Derive the month first, then pivot with Month as the ONLY id column so
# that values_fn = sum aggregates every Date falling in the same month.
# If Date were left as an implicit id column, pivot_wider() would return
# one row per Date instead of one row per month.
df1 |>
  ungroup() |>
  mutate(Month = as.yearmon(Date)) |>
  pivot_wider(
    id_cols = Month,
    names_from = Category,
    values_from = Value,
    values_fn = sum,
    values_fill = 0
  )
Then call xts.
Month = as.yearmon(Date) shouldn't cause a problem if Date is a date-type. However, if it is causing trouble as you indicate in your comment, use as.yearmon(format(Date, "%Y-%m-%d")) instead.
Output:
# A tibble: 12 × 4
Month `Type 1` `Type 2` `Type 3`
<yearmon> <dbl> <dbl> <dbl>
1 Jan 2007 2875 1200 0
2 Feb 2007 3250 2750 0
3 Mar 2007 3200 0 2750
4 Apr 2007 2950 1100 0
5 May 2007 3250 2500 0
6 Jun 2007 3250 0 2500
7 Jul 2007 4150 0 0
8 Sep 2007 2900 0 2500
9 Oct 2007 4375 0 0
10 Nov 2007 5000 2500 0
11 Aug 2007 0 2500 0
12 Dec 2007 0 950 2500
Update: After @akrun updated their answer with a similar solution, my solution seems more verbose. The reason is that my approach works directly on the df1 object and solves the problem there.

Use read.zoo like this:
library(zoo)
# Split Rolling_Sum into one column per Category, summing rows that share
# a Month, then replace the NAs left by missing combinations with 0.
# The argument is spelled out as index.column (its full name) rather than
# relying on partial matching of `index`.
df_month |>
  read.zoo(index.column = "Month", split = "Category", aggregate = sum) |>
  na.fill(0)
giving this zoo object -- as.xts can be used to convert that to xts if needed.
Type 1 Type 2 Type 3
Jan 2007 2875 1200 0
Feb 2007 3250 2750 0
Mar 2007 3200 0 2750
Apr 2007 2950 1100 0
May 2007 3250 2500 0
Jun 2007 3250 0 2500
Jul 2007 4150 0 0
Aug 2007 0 2500 0
Sep 2007 2900 0 2500
Oct 2007 4375 0 0
Nov 2007 5000 2500 0
Dec 2007 0 950 2500
or directly from df1 modified from the comment below
# df1 is already supplied by the pipe as read.zoo()'s first argument;
# passing it a second time would bind it to the next parameter (format).
# The first column (Date) is the index and is converted with as.yearmon.
df1 |>
  read.zoo(FUN = as.yearmon, split = "Category", aggregate = sum) |>
  na.fill(0)
Note
df_month from question in immediately reproducible form
df_month <-
structure(list(Category = c("Type 1", "Type 1", "Type 1", "Type 1",
"Type 1", "Type 1", "Type 1", "Type 1", "Type 1", "Type 1", "Type 1",
"Type 1", "Type 1", "Type 1", "Type 1", "Type 1", "Type 1", "Type 1",
"Type 1", "Type 1", "Type 1", "Type 1", "Type 1", "Type 2", "Type 2",
"Type 2", "Type 2", "Type 2", "Type 2", "Type 2", "Type 3", "Type 3",
"Type 3", "Type 3"), Month = structure(c(2007, 2007, 2007.08333333333,
2007.08333333333, 2007.16666666667, 2007.16666666667, 2007.25,
2007.25, 2007.33333333333, 2007.33333333333, 2007.41666666667,
2007.41666666667, 2007.5, 2007.5, 2007.5, 2007.66666666667, 2007.66666666667,
2007.75, 2007.75, 2007.75, 2007.83333333333, 2007.83333333333,
2007.83333333333, 2007, 2007.08333333333, 2007.25, 2007.33333333333,
2007.58333333333, 2007.83333333333, 2007.91666666667, 2007.16666666667,
2007.41666666667, 2007.66666666667, 2007.91666666667), class = "yearmon"),
Rolling_Sum = c(2250, 625, 2250, 1000, 2250, 950, 2000, 950,
2250, 1000, 2250, 1000, 2250, 1200, 700, 2000, 900, 2250,
1200, 925, 2250, 750, 2000, 1200, 2750, 1100, 2500, 2500,
2500, 950, 2750, 2500, 2500, 2500)), row.names = c(NA, -34L
), groups = structure(list(Category = c("Type 1", "Type 2", "Type 3"
), .rows = structure(list(1:23, 24:30, 31:34), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -3L), .drop = TRUE), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"))

Related

Is there a way to plot a correlation heatmap between two dataframes in R? The two dataframes have different row names and are of unequal dimensions

I have two different dataframes as shown in figures attached. Dataframe1 and Dataframe2.
This is what I tried.
#First dataframe
structure(list(Label = c("Gene 1", "Gene 2", "Gene 3", "Gene 4",
"Gene 5", "Gene 6", "Gene 7", "Gene 8", "Gene 9", "Gene 10",
"Gene 11", "Gene 12", "Gene 13", "Gene 14", "Gene 15", "Gene 16",
"Gene 17", "Gene 18", "Gene 19", "Gene 20", "Gene 21", "Gene 22",
"Gene 23", "Gene 24", "Gene 25", "Gene 26", "Gene 27", "Gene 28",
"Gene 29", "Gene 30"), Count = c(1500, 1600, 1700, 1800, 1900,
2000, 2100, 2200, 2300, 2400, 2500, 2600, 2700, 2800, 2900, 3000,
3100, 3200, 3300, 3400, 3500, 3600, 3700, 3800, 3900, 4000, 4100,
4200, 4300, 4400)), class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA,
-30L))
df_1 <- read_excel("Demo_data.xlsx", sheet = "Dataframe1")
str(df_1)
View(df_1)
df_1.1 <- column_to_rownames(df_1, 'Label')
View(df_1.1)
df_1.2 <- t(df_1.1)
View(df_1.2)
df_1.2 <- as.data.frame(df_1.2)
str(df_1.2)
typeof(dff1)
str(dff1)
#Second dataframe
structure(list(Label = c("Control1", "Control2", "Control3",
"Control4", "Control5", "Control6", "Control7", "Control8", "Control9",
"Control10", "Control11", "Control12", "Control13", "Control14",
"Control15", "Control16", "Control17", "Control18", "Control19",
"Control20", "Control21", "Control22", "Control23", "Control24"
), Count = c(1800, 1400, 1110, 1900, 2500, 2900, 2100, 900, 5000,
2300, 700, 1400, 3400, 2310, 3322, 2200, 4400, 2100, 1000, 6700,
4300, 2120, 4800, 4300)), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -24L))
df_2 <- read_excel("Demo_data.xlsx", sheet = "Dataframe2")
df_2.1 <- column_to_rownames(df_2, 'Label')
View(df_2.1)
df_2.1 <- t(df_2.1)
View(df_2.1)
df_2.1 <- as.data.frame(df_2.1)
str(df_2.1)
correlation <- cor(df_1.2, df_2.1)
View(correlation)
This is my desired output but I am getting NA for every correlation. Any help is highly appreciated.
Desired output (without NA)
As it is written in the comments, what you are trying to achieve is rather unclear.
If you want to compute the correlation between the Count column in each dataframe and visualize it using a scatterplot, you can use the following code:
library(tidyverse)
df_1 <- structure(list(Label = c("Gene 1", "Gene 2", "Gene 3", "Gene 4",
"Gene 5", "Gene 6", "Gene 7", "Gene 8", "Gene 9", "Gene 10",
"Gene 11", "Gene 12", "Gene 13", "Gene 14", "Gene 15", "Gene 16",
"Gene 17", "Gene 18", "Gene 19", "Gene 20", "Gene 21", "Gene 22",
"Gene 23", "Gene 24", "Gene 25", "Gene 26", "Gene 27", "Gene 28",
"Gene 29", "Gene 30"),
Count = c(1500, 1600, 1700, 1800, 1900, 2000, 2100, 2200, 2300, 2400, 2500,
2600, 2700, 2800, 2900, 3000, 3100, 3200, 3300, 3400, 3500, 3600,
3700, 3800, 3900, 4000, 4100, 4200, 4300, 4400)),
class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -30L))
df_2 <- structure(list(Label = c("Control1", "Control2", "Control3",
"Control4", "Control5", "Control6", "Control7", "Control8", "Control9",
"Control10", "Control11", "Control12", "Control13", "Control14",
"Control15", "Control16", "Control17", "Control18", "Control19",
"Control20", "Control21", "Control22", "Control23", "Control24"),
Count = c(1800, 1400, 1110, 1900, 2500, 2900, 2100, 900, 5000, 2300, 700, 1400,
3400, 2310, 3322, 2200, 4400, 2100, 1000, 6700, 4300, 2120, 4800, 4300)),
class = c("tbl_df", "tbl", "data.frame"), row.names = c(NA, -24L))
# Pair each gene with the control that carries the same number in its
# label: the first run of digits in Label is used as the join key.
# Genes without a matching control keep NA in the *_ctl columns.
dat <- left_join(
  df_1 %>% mutate(id = str_extract(Label, "\\d+")),
  df_2 %>% mutate(id = str_extract(Label, "\\d+")),
  by = "id", suffix = c("_gene", "_ctl")
)
dat
#> # A tibble: 30 x 5
#> Label_gene Count_gene id Label_ctl Count_ctl
#> <chr> <dbl> <chr> <chr> <dbl>
#> 1 Gene 1 1500 1 Control1 1800
#> 2 Gene 2 1600 2 Control2 1400
#> 3 Gene 3 1700 3 Control3 1110
#> 4 Gene 4 1800 4 Control4 1900
#> 5 Gene 5 1900 5 Control5 2500
#> 6 Gene 6 2000 6 Control6 2900
#> 7 Gene 7 2100 7 Control7 2100
#> 8 Gene 8 2200 8 Control8 900
#> 9 Gene 9 2300 9 Control9 5000
#> 10 Gene 10 2400 10 Control10 2300
#> # ... with 20 more rows
cor(dat$Count_gene, dat$Count_ctl, use="pairwise.complete.obs")
#> [1] 0.5047392
ggplot(dat, aes(x=Count_gene, y=Count_ctl)) +
geom_point()
#> Warning: Removed 6 rows containing missing values (`geom_point()`).
Created on 2022-12-12 with reprex v2.0.2
Basically, I extracted the id as the last digits of the label, then used left_join() to merge the dataframes.
This might look overly complicated but it is always a good idea to keep your data tidy in one dataframe.
Note that in your example, df_2 stops at id==24 so the correlation is computed on the 24 complete observations only.
However, a correlation is computed across 2 vectors, so in order to have a heatmap you would need a set of many vectors, which you don't seem to have.
For your next questions, it would be great if you use the reprex package as I did in this answer.

Trouble fitting `glmer()` and `gamm4()` models on uncomplicated data set - Is it a computer speed or complexity issue?

I am having issues with computing a GLMM in R using the lme4 package. The issue is that it is attempting to build the model but the loading stays in a suspended state (The stop sign button just remains on). Surely it can't be because of the "complexity" of my model or the data set. My full data set is under 20,000 observations. I fit a less complex model to the data and it produced a result without issue:
binary_female_model_3_5 = glmer(data = binary_female_data, formula = sex ~ device_name + (1| province),
family = binomial, control=glmerControl(optimizer="bobyqa", optCtrl=list(maxfun=2e5)))
For reference our professor gave us the code control=glmerControl(optimizer="bobyqa", optCtrl=list(maxfun=2e5)) to add to our models if we have "convergence issues". It is not a concept that I fully understand and it isn't a part of the content of what I'm doing it was given as a fix just in case.
I am at a loss for what I could do. I attempted using a gam() and gamm4(), but I have not really learned anything on using those models and that package yet so I feel uncomfortable trying to mess with it on this constricted schedule. I posted part of the data set below. All the data is made up so it is of no concern to be shown.
The model I am attempting to fit:
binary_female_model_4 = glmer(data = binary_female_data, formula = sex ~ device_name + (device_name| province),
family = binomial, control=glmerControl(optimizer="bobyqa", optCtrl=list(maxfun=2e5)))
The data:
dput(head(binary_female_data, 15))
structure(list(cust_id = c("0000af7e-418a-402d-a7c4-0dfe2d7eafcb",
"00084c10-8e0f-4ab9-a9b8-2e5eef964f60", "00092283-3131-44a5-be65-c802a6a38fd4",
"000bbf08-551c-41c3-8ae2-c136d33e721d", "00128697-9e88-44ce-b57f-5859cff6bc09",
"0014164a-f65a-4324-b58f-11f3576a5f54", "00146c9e-16c3-43ae-b2e7-b2d56f9343e8",
"0014b3d9-7f56-4321-b2b7-9f06bab0dac2", "00153635-09d2-4500-8513-0d50d87a8110",
"00169e8f-f855-4bbf-8138-7f5a1fdd1991", "001ae3ef-da49-4ce6-bfc9-71d119ec8f03",
"001fa5b5-03e0-401c-9291-0a60c4d5a846", "0020d464-ed55-4d28-8719-1d473e74e57c",
"0020e97e-7868-4179-a79d-b3339c1f9cd2", "0023e3e9-4742-4e54-8e27-5b243b4c1b50"
), dob = structure(c(-985, -1690, 4197, -9358, 8216, 198, 1850,
-225, -3407, 6928, -7278, 3995, -14041, -3981, 10678), class = "Date"),
age = c(55, 57, 41, 78, 30, 52, 47, 53, 62, 34, 72, 42, 91,
63, 23), province = c("Quebec", "Alberta", "Alberta", "Ontario",
"Ontario", "Ontario", "Alberta", "British Columbia", "Alberta",
"Ontario", "Quebec", "Quebec", "Ontario", "Ontario", "Ontario"
), sex = c(FALSE, TRUE, FALSE, FALSE, FALSE, TRUE, TRUE,
FALSE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE), pronouns = c("he/him",
"she/her", "he/him", "he/him", "he/him", "she/her", "she/her",
"he/him", "he/him", "she/her", "she/her", "she/her", "she/her",
"she/her", "she/her"), hhld_median_inc = c(59881, 87225,
97334, 65829, 65829, 65829, 97334, 65241, 97334, 83018, 50227,
50227, 65829, 87290, 85981), age_grp = c("51 , 64", "51 , 64",
"41 , 50", "65 +", "24 , 32", "51 , 64", "41 , 50", "51 , 64",
"51 , 64", "33 , 40", "65 +", "41 , 50", "65 +", "51 , 64",
"18 , 23"), emoji_modifier = structure(c("U+1F3FF", NA, NA,
"U+1F3FD", "U+1F3FF", NA, "U+1F3FC", NA, "U+1F3FC", "U+1F3FB",
"U+1F3FF", "U+1F3FD", NA, "U+1F3FB", "U+1F3FD"), .Names = c("",
"", "", "", "", "", "", "", "", "", "", "", "", "", "")),
skin_tone = c("dark", NA, NA, "medium", "dark", NA, "medium-light",
NA, "medium-light", "light", "dark", "medium", NA, "light",
"medium"), dev_id = c("mg-2020-909", "mg-2020-909", "mg-2021-960",
"mg-2021-960", "mg-2021-987", "mg-2021-987", "mg-2021-960",
"mg-2020-909", "mg-2021-960", "mg-2021-987", "mg-2020-658",
"mg-2020-909", "mg-2021-987", "mg-2021-960", "mg-2021-960"
), device_name = c("Run BE", "Run BE", "Advance 2", "Advance 2",
"Run ON", "Run ON", "Advance 2", "Run BE", "Advance 2", "Run ON",
"Active Alpha", "Run BE", "Run ON", "Advance 2", "Advance 2"
), line = c("Run", "Run", "Advance", "Advance", "Run", "Run",
"Advance", "Run", "Advance", "Run", "Active", "Run", "Run",
"Advance", "Advance"), released = structure(c(18586, 18586,
18816, 18816, 18965, 18965, 18816, 18586, 18816, 18965, 18626,
18586, 18965, 18816, 18816), class = "Date"), recommended_retail_price = c(299.99,
299.99, 145, 145, 349.99, 349.99, 145, 299.99, 145, 349.99,
99.99, 299.99, 349.99, 145, 145)), class = c("grouped_df",
"tbl_df", "tbl", "data.frame"), row.names = c(NA, -15L), groups = structure(list(
cust_id = c("0000af7e-418a-402d-a7c4-0dfe2d7eafcb", "00084c10-8e0f-4ab9-a9b8-2e5eef964f60",
"00092283-3131-44a5-be65-c802a6a38fd4", "000bbf08-551c-41c3-8ae2-c136d33e721d",
"00128697-9e88-44ce-b57f-5859cff6bc09", "0014164a-f65a-4324-b58f-11f3576a5f54",
"00146c9e-16c3-43ae-b2e7-b2d56f9343e8", "0014b3d9-7f56-4321-b2b7-9f06bab0dac2",
"00153635-09d2-4500-8513-0d50d87a8110", "00169e8f-f855-4bbf-8138-7f5a1fdd1991",
"001ae3ef-da49-4ce6-bfc9-71d119ec8f03", "001fa5b5-03e0-401c-9291-0a60c4d5a846",
"0020d464-ed55-4d28-8719-1d473e74e57c", "0020e97e-7868-4179-a79d-b3339c1f9cd2",
"0023e3e9-4742-4e54-8e27-5b243b4c1b50"), .rows = structure(list(
1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L, 10L, 11L, 12L, 13L,
14L, 15L), ptype = integer(0), class = c("vctrs_list_of",
"vctrs_vctr", "list"))), class = c("tbl_df", "tbl", "data.frame"
), row.names = c(NA, -15L), .drop = TRUE))
If somebody knowledgeable would be gracious enough to help.
EDIT: the 3 calls requested by Gregor:
> table(binary_female_data$sex)
FALSE TRUE
7966 11091
> table(binary_female_data$device_name)
Active Active Alpha Active HR Advance Advance 2 iDOL Run Run 7 Run 7 Plus
279 1763 299 2044 6191 158 3 43 72
Run 875 Run 875 X Run BE Run HYYH Run Leader Run ON
631 77 3736 1 2 3758
> with(binary_female_data, table(device_name, province))
province
device_name Alberta British Columbia Manitoba New Brunswick Newfoundland and Labrador Nova Scotia Ontario
Active 27 18 10 2 0 6 140
Active Alpha 62 119 80 5 1 40 885
Active HR 33 27 11 2 1 7 137
Advance 242 147 99 2 3 36 1104
Advance 2 1004 340 164 15 9 93 2807
iDOL 31 11 5 0 0 2 79
Run 0 0 0 0 0 0 0
Run 7 0 2 0 0 0 0 36
Run 7 Plus 28 1 0 0 0 0 27
Run 875 156 29 23 0 0 14 337
Run 875 X 9 5 2 0 0 2 44
Run BE 678 258 130 6 3 66 1961
Run HYYH 0 0 0 0 0 0 0
Run Leader 0 0 0 0 0 0 0
Run ON 703 242 165 8 2 54 1952
province
device_name Prince Edward Island Quebec Saskatchewan Yukon
Active 0 75 1 0
Active Alpha 0 554 17 0
Active HR 2 78 1 0
Advance 1 397 13 0
Advance 2 1 1689 68 1
iDOL 0 26 4 0
Run 0 3 0 0
Run 7 0 4 1 0
Run 7 Plus 0 14 1 1
Run 875 0 64 8 0
Run 875 X 0 14 1 0
Run BE 1 592 41 0
Run HYYH 0 1 0 0
Run Leader 0 2 0 0
Run ON 0 595 37 0

Multiply numeric columns of two different dataframes based on matching of other columns

I have the first dataframe like:
df1<-structure(list(`Demand Per Section` = c(80, 125, 350, 100, 538,
75, 25, 138, 138, 75, 150, 37, 225, 35, 40, 125, 25, 25, 125,
50), `Element Name` = c("Naphthalene", "Nitric acid (concentrated)",
"Sulphuric acid(concentrated)", "2-hydroxybenzoic acid", "Acetic anhydride",
"2-Naphthol", "Sodium Hydroxide", "Phenyl hydrazine hydrochloride",
"Glucose", "Sodium acetate", "Aniline", "Zinc poweder", "2-amino-benzoic acid",
"1.3-dihydroxybenzene", "Ethyl acetate", "hydroxy benzene", "phenyl methanol",
"Sodium carbonate", "Potassium permanganate", "Sodium bisulfite."
), `Course Name` = c("Course 1", "Course 1", "Course 1", "Course 1",
"Course 1", "Course 1", "Course 1", "Course 1", "Course 1", "Course 1",
"Course 1", "Course 1", "Course 1", "Course 1", "Course 1", "Course 1",
"Course 1", "Course 1", "Course 1", "Course 1"), Department = c("Chemsitry",
"Chemsitry", "Chemsitry", "Chemsitry", "Chemsitry", "Chemsitry",
"Chemsitry", "Chemsitry", "Chemsitry", "Chemsitry", "Chemsitry",
"Chemsitry", "Chemsitry", "Chemsitry", "Chemsitry", "Chemsitry",
"Chemsitry", "Chemsitry", "Chemsitry", "Chemsitry")), row.names = c(NA,
-20L), class = c("tbl_df", "tbl", "data.frame"), na.action = structure(c(`81` = 81L,
`101` = 101L, `127` = 127L, `134` = 134L, `135` = 135L, `136` = 136L,
`174` = 174L, `183` = 183L, `220` = 220L, `225` = 225L, `245` = 245L,
`286` = 286L, `288` = 288L, `290` = 290L, `305` = 305L, `314` = 314L,
`324` = 324L, `329` = 329L), class = "omit"))
`Demand Per Section` `Element Name` `Course Name` Department
<dbl> <chr> <chr> <chr>
1 80 Naphthalene Course 1 Chemsitry
2 125 Nitric acid (concentrated) Course 1 Chemsitry
3 350 Sulphuric acid(concentrated) Course 1 Chemsitry
4 100 2-hydroxybenzoic acid Course 1 Chemsitry
5 538 Acetic anhydride Course 1 Chemsitry
6 75 2-Naphthol Course 1 Chemsitry
7 25 Sodium Hydroxide Course 1 Chemsitry
8 138 Phenyl hydrazine hydrochloride Course 1 Chemsitry
9 138 Glucose Course 1 Chemsitry
10 75 Sodium acetate Course 1 Chemsitry
11 150 Aniline Course 1 Chemsitry
12 37 Zinc poweder Course 1 Chemsitry
13 225 2-amino-benzoic acid Course 1 Chemsitry
14 35 1.3-dihydroxybenzene Course 1 Chemsitry
15 40 Ethyl acetate Course 1 Chemsitry
16 125 hydroxy benzene Course 1 Chemsitry
17 25 phenyl methanol Course 1 Chemsitry
18 25 Sodium carbonate Course 1 Chemsitry
and a second dataframe like:
df2<-structure(list(`Course name` = c("Course 1", "Course 2", "Course 3",
"Course 4", "Course 1", "Course 2", "Course 3", "Course 4", "Course 5",
"Course 1", "Course 2", "Course 3", "Course 4", "Course 5"),
`number of sections` = c(3, 5, 3, 4, 7, 2, 7, 10, 3, 4, 5,
6, 2, 2), Department = c("Chemsitry", "Chemsitry", "Chemsitry",
"Chemsitry", "Biology", "Biology", "Biology", "Biology",
"Biology", "Physics", "Physics", "Physics", "Physics", "Physics"
)), row.names = c(NA, -14L), class = c("tbl_df", "tbl", "data.frame"
))
`Course name` `number of sections` Department
<chr> <dbl> <chr>
1 Course 1 3 Chemsitry
2 Course 2 5 Chemsitry
3 Course 3 3 Chemsitry
4 Course 4 4 Chemsitry
5 Course 1 7 Biology
6 Course 2 2 Biology
7 Course 3 7 Biology
8 Course 4 10 Biology
9 Course 5 3 Biology
10 Course 1 4 Physics
11 Course 2 5 Physics
12 Course 3 6 Physics
13 Course 4 2 Physics
14 Course 5 2 Physics
What I want is to create a new column in df1 named DemandCourse which will look into df2 in the columns of Course Name and Department and when both will match with Course Name and Department of df1 it will multiply the Demand per Section of df1 with the number of sections of df2. So for example the first row of the new column will be 80*3 =240
We may need to join (left_join) and then mutate to create the column
library(dplyr)
# Attach the section count for the matching course and department, then
# compute DemandCourse; .keep = "unused" drops the two multiplied columns.
df1 <- mutate(
  left_join(
    df1, df2,
    by = c("Course Name" = "Course name", "Department")
  ),
  DemandCourse = `Demand Per Section` * `number of sections`,
  .keep = "unused"
)
-output
> df1
# A tibble: 20 × 4
`Element Name` `Course Name` Department DemandCourse
<chr> <chr> <chr> <dbl>
1 Naphthalene Course 1 Chemsitry 240
2 Nitric acid (concentrated) Course 1 Chemsitry 375
3 Sulphuric acid(concentrated) Course 1 Chemsitry 1050
4 2-hydroxybenzoic acid Course 1 Chemsitry 300
5 Acetic anhydride Course 1 Chemsitry 1614
6 2-Naphthol Course 1 Chemsitry 225
7 Sodium Hydroxide Course 1 Chemsitry 75
8 Phenyl hydrazine hydrochloride Course 1 Chemsitry 414
9 Glucose Course 1 Chemsitry 414
10 Sodium acetate Course 1 Chemsitry 225
11 Aniline Course 1 Chemsitry 450
12 Zinc poweder Course 1 Chemsitry 111
13 2-amino-benzoic acid Course 1 Chemsitry 675
14 1.3-dihydroxybenzene Course 1 Chemsitry 105
15 Ethyl acetate Course 1 Chemsitry 120
16 hydroxy benzene Course 1 Chemsitry 375
17 phenyl methanol Course 1 Chemsitry 75
18 Sodium carbonate Course 1 Chemsitry 75
19 Potassium permanganate Course 1 Chemsitry 375
20 Sodium bisulfite. Course 1 Chemsitry 150

In dplyr group_by() + summarise(sum)is not working

This is my code:
df <- structure(list(NOME = c("JOGADOR 1", "JOGADOR 1", "JOGADOR 6",
"JOGADOR 6", "JOGADOR 5", "JOGADOR 5", "JOGADOR 3", "JUGADOR 3",
"JOGADOR 9", "JOGADOR 9", "JOGADOR 7", "JOGADOR 7", "JOGADOR 8",
"JOGADOR 8", "JOGADOR 10", "JOGADOR 10", "JOGADOR 4", "JOGADOR 4",
"JOGADOR 2", "JOGADOR 2", "JOGADOR 12", "JOGADOR 11", "JOGADOR 13"
), TOTAL_MINUTES = c(48.15, 43, 48.15, 51.9333333333333, 48.15,
51.9333333333333, 48.15, 51.9333333333333, 48.15, 25, 48.15,
51.9333333333333, 48.15, 29, 48.15, 42, 48.15, 51.9333333333333,
48.15, 51.9333333333333, 17, 26, 9), TOTAL.DISTANCE = c(5264L,
3999L, 5242L, 5589L, 5684L, 5966L, 4833L, 5012L, 5013L, 2653L,
5452L, 5691L, 5041L, 3775L, 5266L, 4321L, 4795L, 4924L, 5209L,
5242L, 2085L, 2703L, 1282L)), row.names = c(NA, -23L), class = c("tbl_df",
"tbl", "data.frame"))
It's a simple task, but it's not working:
df %>%group_by(NOME) %>%
summarise(across(TOTAL_MINUTES:TOTAL.DISTANCE),sum())
It is just repeating the NOME column values. It's not summing and giving one line per "JOGADOR X".
Why? Any help?
The across was closed without the sum. Also, if we are not providing any lambda expression, we don't use sum()
library(dplyr)
# Sum both columns within each NOME group; na.rm = TRUE is forwarded to sum()
# and .groups = 'drop' returns an ungrouped result.
# NOTE(review): forwarding extra arguments through across() is deprecated
# since dplyr 1.1.0 -- the lambda form shown further down is the supported one.
df %>%
group_by(NOME) %>%
summarise(across(TOTAL_MINUTES:TOTAL.DISTANCE, sum,
na.rm = TRUE), .groups = 'drop')
-output
# A tibble: 14 × 3
NOME TOTAL_MINUTES TOTAL.DISTANCE
<chr> <dbl> <int>
1 JOGADOR 1 91.2 9263
2 JOGADOR 10 90.2 9587
3 JOGADOR 11 26 2703
4 JOGADOR 12 17 2085
5 JOGADOR 13 9 1282
6 JOGADOR 2 100. 10451
7 JOGADOR 3 48.2 4833
8 JOGADOR 4 100. 9719
9 JOGADOR 5 100. 11650
10 JOGADOR 6 100. 10831
11 JOGADOR 7 100. 11143
12 JOGADOR 8 77.2 8816
13 JOGADOR 9 73.2 7666
14 JUGADOR 3 51.9 5012
Or using lambda expression
df %>%
group_by(NOME) %>%
summarise(across(TOTAL_MINUTES:TOTAL.DISTANCE, ~sum(.x,
na.rm = TRUE)), .groups = 'drop')

Formatting grouped data for tables in R

I'm trying to display my data in table format and I can't figure out how to rearrange my data to display it in the proper format. I'm used to wrangling data for plots, but I'm finding myself a little lost when it comes to preparing tables. This seems like something really basic, but I haven't been able to find an explanation on what I'm doing wrong here.
I have 3 columns of data, Type, Year, and n. The data formatted as it is now produces a table that looks like this:
Type Year n
Type C 1 5596
Type D 1 1119
Type E 1 116
Type A 1 402
Type F 1 1614
Type B 1 105
Type C 2 26339
Type D 2 14130
Type E 2 98
Type A 2 3176
Type F 2 3071
Type B 2 88
What I want to do is to have Type as row names, Year as column names, and n populating the table contents like this:
1 2
Type A 402 3176
Type B 105 88
Type C 5596 26339
Type D 1119 14130
Type E 116 98
Type F 1614 3071
The mistake might have been made upstream from this point. Using the full original data set I arrived at this output by doing the following:
exampletable <- df %>%
group_by(Year) %>%
count(Type) %>%
select(Type, Year, n)
Here is the dput() output
structure(list(Type = c("Type C", "Type D", "Type E", "Type A",
"Type F", "Type B", "Type C", "Type D", "Type E", "Type A", "Type F",
"Type B", "Type C", "Type D", "Type E", "Type A", "Type F", "Type B",
"Type C", "Type D", "Type E", "Type A", "Type F", "Type B", "Type C",
"Type D", "Type E"), Year = c(1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,
2, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 5, 5, 5), n = c(5596,
1119, 116, 402, 1614, 105, 26339, 14130, 98, 3176, 3071, 88,
40958, 17578, 104, 3904, 3170, 102, 33145, 23800, 93, 1264, 7084,
1262, 34642, 24911, 504)), class = c("spec_tbl_df", "tbl_df",
"tbl", "data.frame"), row.names = c(NA, -27L), spec = structure(list(
cols = list(Type = structure(list(), class = c("collector_character",
"collector")), Year = structure(list(), class = c("collector_double",
"collector")), n = structure(list(), class = c("collector_double",
"collector"))), default = structure(list(), class = c("collector_guess",
"collector")), skip = 1), class = "col_spec"))
You can get the data in wide format and change Type column to rowname.
tidyr::pivot_wider(df, names_from = Year, values_from = n) %>%
tibble::column_to_rownames('Type')
# 1 2 3 4 5
#Type C 5596 26339 40958 33145 34642
#Type D 1119 14130 17578 23800 24911
#Type E 116 98 104 93 504
#Type A 402 3176 3904 1264 NA
#Type F 1614 3071 3170 7084 NA
#Type B 105 88 102 1262 NA
You can use tidyr package to get to wider format and tibble package to convert a column to rownames
dataset <- read.csv(file_location)
dataset <- tidyr::pivot_wider(dataset, names_from = Year, values_from = n)
tibble::column_to_rownames(dataset, var = 'Type')
1 2
Type C 5596 26339
Type D 1119 14130
Type E 116 98
Type A 402 3176
Type F 1614 3071
Type B 105 88

Resources